From 45b2fe194c6fd116fdaf7c0e1c677187c84aea5b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 12 Nov 2025 18:48:40 +0100
Subject: [PATCH 01/21] server: split HTTP into its own interface

---
 tools/server/server-http.h |  362 ++++
 tools/server/server.cpp    | 1348 +++++------------
 tools/server/utils.hpp     |   22 +-
 3 files changed, 531 insertions(+), 1201 deletions(-)
 create mode 100644 tools/server/server-http.h

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
new file mode 100644
index 0000000000000..5d93b94b6ce95
--- /dev/null
+++ b/tools/server/server-http.h
@@ -0,0 +1,362 @@
+#pragma once
+
+#include "utils.hpp"
+#include "common.h"
+
+#include <functional>
+#include <memory>
+#include <string>
+
+// auto generated files (see README.md for details)
+#include "index.html.gz.hpp"
+#include "loading.html.hpp"
+
+// generator-like API for HTTP response generation
+struct server_http_resgen {
+    std::string content_type = "application/json; charset=utf-8";
+    int status = 200;
+    std::string data;
+
+    // if is_stream is true, next() will return true until the stream ends
+    // the data member will contain the next chunk of data to send
+    // TODO: move this to a virtual function once we have proper polymorphism support
+    std::function<bool()> next = nullptr;
+    bool is_stream() const {
+        return next != nullptr;
+    }
+
+    virtual ~server_http_resgen() = default;
+};
+
+// unique pointer, used by set_chunked_content_provider
+// we need to use unique_ptr because httplib requires the stream provider to be stored on the heap
+using server_http_resgen_ptr = std::unique_ptr<server_http_resgen>;
+
+struct server_http_request {
+    std::unordered_map<std::string, std::string> query_params;
+    json body;
+    const std::function<bool()> & should_stop;
+};
+
+struct server_http_context {
+    std::thread thread;
+    std::unique_ptr<httplib::Server> svr;
+    std::atomic<bool> is_ready = false;
+
+    std::string path_prefix;
+    std::string hostname;
+    int port;
+
+    bool init(const common_params & params);
+    bool start();
+    void stop();
+
+    using handler_t = std::function<server_http_resgen_ptr(const server_http_request &)>;
+    void get(const std::string &, handler_t);
+    void post(const std::string &, handler_t);
+};
+
+// implementation details
+
+static void log_server_request(const httplib::Request & req, const httplib::Response & res) {
+    // skip GH copilot requests when using default port
+    if (req.path == "/v1/health") {
+        return;
+    }
+
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in a try-catch
+
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
+}
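The structs above are the whole contract between the HTTP layer and the request handlers: a handler seeds `data` with the first (or only) chunk, and when `next` is set, the HTTP layer re-sends `data` after every successful next() call until next() returns false. As a minimal illustration (an editor's sketch, not part of the patch; the countdown_resgen name is hypothetical), a generator streaming three SSE frames looks like this:

    // hypothetical subclass of server_http_resgen, for illustration only
    struct countdown_resgen : server_http_resgen {
        int remaining = 3;
        countdown_resgen() {
            content_type = "text/event-stream";
            data = "data: 3\n\n"; // first chunk, sent before next() is ever called
            next = [this]() {
                if (--remaining <= 0) {
                    return false; // stream ends; on_complete() destroys the object
                }
                data = "data: " + std::to_string(remaining) + "\n\n";
                return true; // one more chunk to send
            };
        }
    };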
+
+bool server_http_context::init(const common_params & params) {
+    path_prefix = params.api_prefix;
+    port        = params.port;
+    hostname    = params.hostname;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        svr.reset(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        );
+    } else {
+        LOG_INF("Running without SSL\n");
+        svr.reset(new httplib::Server());
+    }
+#else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return false;
+    }
+    svr.reset(new httplib::Server());
+#endif
+
+    svr->set_default_headers({{"Server", "llama.cpp"}});
+    svr->set_logger(log_server_request);
+
+    svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
+        std::string message;
+        try {
+            std::rethrow_exception(ep);
+        } catch (const std::exception & e) {
+            message = e.what();
+        } catch (...) {
+            message = "Unknown Exception";
+        }
+
+        // FIXME
+        GGML_UNUSED(res);
+        GGML_UNUSED(message);
+        // try {
+        //     json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+        //     LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+        //     res_error(res, formatted_error);
+        // } catch (const std::exception & e) {
+        //     LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
+        // }
+    });
+
+    svr->set_error_handler([](const httplib::Request &, httplib::Response & res) {
+        if (res.status == 404) {
+            // FIXME
+            //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
+            res.set_content("404 Not Found", "text/plain");
+        }
+        // for other error codes, we skip processing here because it's already done by res_error()
+    });
+
+    // set timeouts and change hostname and port
+    svr->set_read_timeout (params.timeout_read);
+    svr->set_write_timeout(params.timeout_write);
+
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
+        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
+    } else if (params.api_keys.size() > 1) {
+        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
+    }
+
+    //
+    // Middlewares
+    //
+
+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/v1/health",
+            "/models",
+            "/v1/models",
+            "/api/tags"
+        };
+
+        // If API key is not set, skip validation
+        if (api_keys.empty()) {
+            return true;
+        }
+
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
+            return true;
+        }
+
+        // Check for API key in the header
+        auto auth_header = req.get_header_value("Authorization");
+
+        std::string prefix = "Bearer ";
+        if (auth_header.substr(0, prefix.size()) == prefix) {
+            std::string received_api_key = auth_header.substr(prefix.size());
+            if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) {
+                return true; // API key is valid
+            }
+        }
+
+        // API key is invalid or not provided
+        //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
+        // FIXME
+        res.status = 401;
+        res.set_content("Unauthorized: Invalid API Key", "text/plain");
+
+        LOG_WRN("Unauthorized: Invalid API Key\n");
+
+        return false;
+    };
+
+    auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
+        bool ready = is_ready.load();
+        if (!ready) {
+            auto tmp = string_split<std::string>(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast<const char *>(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
+                // allow the models endpoint to be accessed during loading
+                return true;
+            } else {
+                // FIXME
+                //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                res.status = 503;
+                res.set_content("503 Service Unavailable: Loading model", "text/plain");
+            }
+            return false;
+        }
+        return true;
+    };
+
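Both middlewares above follow the same implicit convention: a callable taking (request, response) that returns true to let routing continue, or false after it has written the response itself. A hypothetical third middleware would slot into the same chain (editor's illustrative sketch, not part of the patch):

    // hypothetical middleware, same shape as middleware_validate_api_key above
    auto middleware_reject_large_body = [](const httplib::Request & req, httplib::Response & res) {
        const size_t max_body = 32 * 1024 * 1024; // hypothetical limit
        if (req.body.size() > max_body) {
            res.status = 413;
            res.set_content("request body too large", "text/plain");
            return false; // stop routing, response already written
        }
        return true; // continue to the next middleware / handler
    };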
+    // register server middlewares
+    svr->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        // If this is an OPTIONS request, skip validation because browsers don't include the Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
+        if (!middleware_server_state(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        if (!middleware_validate_api_key(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    int n_threads_http = params.n_threads_http;
+    if (n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
+    svr->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); };
+
+    //
+    // Web UI setup
+    //
+
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
+    } else {
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return false;
+            }
+        } else {
+            // using embedded static index.html
+            svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
+                    res.set_header("Cross-Origin-Embedder-Policy", "require-corp");
+                    res.set_header("Cross-Origin-Opener-Policy",   "same-origin");
+                    res.set_content(reinterpret_cast<const char *>(index_html_gz), index_html_gz_len, "text/html; charset=utf-8");
+                }
+                return false;
+            });
+        }
+    }
+    return true;
+}
+
+bool server_http_context::start() {
+    // Bind and listen
+
+    bool was_bound = false;
+    bool is_sock   = false;
+    if (string_ends_with(std::string(hostname), ".sock")) {
+        is_sock = true;
+        LOG_INF("%s: setting address family to AF_UNIX\n", __func__);
+        svr->set_address_family(AF_UNIX);
+        // bind_to_port requires a second arg, any value other than 0 should
+        // simply get ignored
+        was_bound = svr->bind_to_port(hostname, 8080);
+    } else {
+        LOG_INF("%s: binding port with default address family\n", __func__);
+        // bind HTTP listen port
+        if (port == 0) {
+            int bound_port = svr->bind_to_any_port(hostname);
+            if ((was_bound = (bound_port >= 0))) {
+                port = bound_port;
+            }
+        } else {
+            was_bound = svr->bind_to_port(hostname, port);
+        }
+    }
+
+    if (!was_bound) {
+        LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port);
+        return false;
+    }
+
+    // run the HTTP server in a thread
+    thread = std::thread([this]() { svr->listen_after_bind(); });
+    svr->wait_until_ready();
+
+    LOG_INF("%s: server is listening on %s\n", __func__,
+        is_sock ? string_format("unix://%s",    hostname.c_str()).c_str() :
+                  string_format("http://%s:%d", hostname.c_str(), port).c_str());
+    return true;
+}
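Taken together, init()/start()/stop() give the embedding application a small lifecycle to drive. A rough usage sketch (editor's illustration, not from the patch; it mirrors what main() in server.cpp does further below):

    // hypothetical driver, for illustration only
    int run_server(const common_params & params) {
        server_http_context ctx_http;
        if (!ctx_http.init(params)) {
            return 1; // bad config (e.g. SSL requested in a non-SSL build)
        }
        // routes are registered between init() and start()
        ctx_http.get("/health", [](const server_http_request &) {
            auto res = std::make_unique<server_http_resgen>();
            res->data = "{\"status\":\"ok\"}";
            return res;
        });
        if (!ctx_http.start()) {
            return 1; // bind failed
        }
        ctx_http.is_ready.store(true); // normally set only after the model has loaded
        // ... serve until a signal arrives ...
        ctx_http.stop();
        ctx_http.thread.join();
        return 0;
    }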
+
+void server_http_context::stop() {
+    if (svr) {
+        svr->stop();
+    }
+}
+
+void server_http_context::get(const std::string & path, server_http_context::handler_t handler) {
+    svr->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        server_http_resgen_ptr response = handler(server_http_request{
+            req.path_params,
+            json{},
+            req.is_connection_closed
+        });
+        GGML_ASSERT(!response->is_stream() && "not supported for GET method");
+        res.status = response->status;
+        res.set_content(response->data, response->content_type);
+    });
+}
+
+void server_http_context::post(const std::string & path, server_http_context::handler_t handler) {
+    svr->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
+        server_http_resgen_ptr response = handler(server_http_request{
+            req.path_params,
+            json::parse(req.body.empty() ? "{}" : req.body),
+            req.is_connection_closed
+        });
+        if (response->is_stream()) {
+            res.status = response->status;
+            std::string content_type = response->content_type;
+            // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it
+            std::shared_ptr<server_http_resgen> r_ptr = std::move(response);
+            const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool {
+                // TODO: maybe handle the case where sink.write() fails? for now, we rely on is_connection_closed()
+                sink.write(response->data.data(), response->data.size());
+                SRV_DBG("http: streamed chunk: %s\n", response->data.c_str());
+                if (!response->next()) {
+                    SRV_DBG("%s", "http: stream ended\n");
+                    sink.done();
+                    return false; // end of stream
+                }
+                return true;
+            };
+            const auto on_complete = [response = r_ptr](bool) mutable {
+                response.reset(); // trigger the destruction of the response object
+            };
+            res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
+        } else {
+            res.status = response->status;
+            res.set_content(response->data, response->content_type);
+        }
+    });
+}
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 0b3c77879c2e2..e4f0167fa838c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1,5 +1,6 @@
 #include "chat.h"
 #include "utils.hpp"
+#include "server-http.h"
 
 #include "arg.h"
 #include "common.h"
@@ -10,13 +11,6 @@
 #include "speculative.h"
 #include "mtmd.h"
 
-// mime type for sending response
-#define MIMETYPE_JSON "application/json; charset=utf-8"
-
-// auto generated files (see README.md for details)
-#include "index.html.gz.hpp"
-#include "loading.html.hpp"
-
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
@@ -2382,6 +2376,7 @@ struct server_context {
         llama_batch_free(batch);
     }
 
+    // load the model and initialize llama_context
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
@@ -2500,6 +2495,7 @@ struct server_context {
         return true;
     }
 
+    // initialize slots and server-related data
     void init() {
        SRV_INF("initializing slots, n_slots = %d\n", params_base.n_parallel);
 
@@ -4319,6 +4315,7 @@ struct server_context {
     }
 };
 
+// generator-like API for server responses, supports polling connection state and aggregating results
 struct server_response_reader {
     std::unordered_set<int> id_tasks;
 
@@ -4417,591 +4414,81 @@ struct server_response_reader {
     }
 };
 
-static void log_server_request(const httplib::Request & 
req, const httplib::Response & res) { - // skip GH copilot requests when using default port - if (req.path == "/v1/health") { - return; +// generator-like API for HTTP response generation +struct server_resgen : server_http_resgen { + server_response_reader rd; + server_resgen(server_context & ctx_server_) : rd(ctx_server_) {} + void ok(const json & response_data) { + status = 200; + data = safe_json_to_str(response_data); } - - // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch - - SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status); - - SRV_DBG("request: %s\n", req.body.c_str()); - SRV_DBG("response: %s\n", res.body.c_str()); -} - -static void res_error(httplib::Response & res, const json & error_data) { - json final_response {{"error", error_data}}; - res.set_content(safe_json_to_str(final_response), MIMETYPE_JSON); - res.status = json_value(error_data, "code", 500); -} - -static void res_ok(httplib::Response & res, const json & data) { - res.set_content(safe_json_to_str(data), MIMETYPE_JSON); - res.status = 200; -} - -std::function shutdown_handler; -std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - -inline void signal_handler(int signal) { - if (is_terminating.test_and_set()) { - // in case it hangs, we can force terminate the server by hitting Ctrl+C twice - // this is for better developer experience, we can remove when the server is stable enough - fprintf(stderr, "Received second interrupt, terminating immediately.\n"); - exit(1); - } - - shutdown_handler(signal); -} - -int main(int argc, char ** argv) { - // own arguments required by this example - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { - return 1; - } - - // TODO: should we have a separate n_parallel parameter for the server? 
- // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 - // TODO: this is a common configuration that is suitable for most local use cases - // however, overriding the parameters is a bit confusing - figure out something more intuitive - if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { - LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - - params.n_parallel = 4; - params.kv_unified = true; - } - - common_init(); - - // struct that contains llama context and inference - server_context ctx_server; - - llama_backend_init(); - llama_numa_init(params.numa); - - LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); - LOG_INF("\n"); - LOG_INF("%s\n", common_params_get_system_info(params).c_str()); - LOG_INF("\n"); - - std::unique_ptr svr; -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return 1; - } - svr.reset(new httplib::Server()); -#endif - - std::atomic state{SERVER_STATE_LOADING_MODEL}; - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) 
{ - message = "Unknown Exception"; - } - - try { - json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - res_error(res, formatted_error); - } catch (const std::exception & e) { - LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - std::unordered_map log_data; - - log_data["hostname"] = params.hostname; - log_data["port"] = std::to_string(params.port); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - log_data["api_key"] = "api_key: ****" + key.substr(std::max((int)(key.length() - 4), 0)); - } else if (params.api_keys.size() > 1) { - log_data["api_key"] = "api_key: " + std::to_string(params.api_keys.size()) + " keys loaded"; + void error(const json & error_data) { + status = json_value(error_data, "code", 500); + data = safe_json_to_str(error_data); } +}; - // Necessary similarity of prompt for slot selection - ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - - // - // Middlewares - // - - auto middleware_validate_api_key = [¶ms](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (params.api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(params.api_keys.begin(), params.api_keys.end(), received_api_key) != params.api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [&state](const httplib::Request & req, httplib::Response & res) { - server_state current_state = state.load(); - if (current_state == SERVER_STATE_LOADING_MODEL) { - auto tmp = string_split(req.path, '.'); - if (req.path == "/" || tmp.back() == "html") { - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { - // allow the models endpoint to be accessed during loading - return true; - } else { - res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - } - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([&middleware_validate_api_key, &middleware_server_state](const httplib::Request & req, httplib::Response & res) { - 
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } - if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); +struct server_routes { + server_context & ctx_server; + server_routes(server_context & ctx_server_) : ctx_server(ctx_server_) {} - // - // Route handlers (or controllers) - // +public: + // handlers using lambda function, so that they can capture `this` without `std::bind` - const auto handle_health = [&](const httplib::Request &, httplib::Response & res) { + server_http_context::handler_t get_health = [this](const server_http_request &) { // error and loading states are handled by middleware - json health = {{"status", "ok"}}; - res_ok(res, health); - }; - - const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) { - if (!params.endpoint_slots) { - res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // optionally return "fail_on_no_slot" error - if (req.has_param("fail_on_no_slot")) { - if (res_task->n_idle_slots == 0) { - res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); - return; - } - } - - res_ok(res, res_task->slots_data); - }; - - const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { - if (!params.endpoint_metrics) { - res_error(res, format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - // request slots data using task queue - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_METRICS); - task.id = task_id; - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task), true); // high-priority task - } - - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - // TODO: get rid of this dynamic_cast - auto res_task = dynamic_cast(result.get()); - GGML_ASSERT(res_task != nullptr); - - // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names - json all_metrics_def = json { - {"counter", {{ - {"name", "prompt_tokens_total"}, - {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} - }, { - {"name", "prompt_seconds_total"}, - {"help", "Prompt process time"}, - {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} - }, { - {"name", "tokens_predicted_total"}, - {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) res_task->n_tokens_predicted_total} - }, { - {"name", "tokens_predicted_seconds_total"}, - {"help", "Predict process time"}, - {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} - }, { - {"name", "n_decode_total"}, - {"help", "Total number of llama_decode() calls"}, - {"value", res_task->n_decode_total} - }, { - {"name", "n_tokens_max"}, - {"help", "Largest observed n_tokens."}, - {"value", res_task->n_tokens_max} - }, { - {"name", "n_busy_slots_per_decode"}, - {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} - }}}, - {"gauge", {{ - {"name", "prompt_tokens_seconds"}, - {"help", "Average prompt throughput in tokens/s."}, - {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} - },{ - {"name", "predicted_tokens_seconds"}, - {"help", "Average generation throughput in tokens/s."}, - {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} - },{ - {"name", "requests_processing"}, - {"help", "Number of requests processing."}, - {"value", (uint64_t) res_task->n_processing_slots} - },{ - {"name", "requests_deferred"}, - {"help", "Number of requests deferred."}, - {"value", (uint64_t) res_task->n_tasks_deferred} - }}} - }; - - std::stringstream prometheus; - - for (const auto & el : all_metrics_def.items()) { - const auto & type = el.key(); - const auto & metrics_def = el.value(); - - for (const auto & metric_def : metrics_def) { - const std::string name = metric_def.at("name"); - const std::string help = metric_def.at("help"); - - auto value = json_value(metric_def, "value", 0.); - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << value << "\n"; - } - } - - res.set_header("Process-Start-Time-Unix", std::to_string(res_task->t_start)); - - res.set_content(prometheus.str(), "text/plain; version=0.0.4"); - res.status = 200; // HTTP OK - }; - - const auto handle_slots_save = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_SAVE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - res_ok(res, result->to_json()); - }; - - const auto handle_slots_restore = [&ctx_server, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { - json request_data = json::parse(req.body); - std::string filename = request_data.at("filename"); - if (!fs_validate_filename(filename)) { - res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); - return; - } - std::string filepath = params.slot_save_path + filename; - - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - task.slot_action.filename = filename; - task.slot_action.filepath = filepath; - - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } - - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); - - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } - - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; - - const auto handle_slots_erase = [&ctx_server](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SLOT_ERASE); - task.id = task_id; - task.slot_action.slot_id = id_slot; - - 
ctx_server.queue_results.add_waiting_task_id(task_id);
-            ctx_server.queue_tasks.post(std::move(task));
-        }
-
-        server_task_result_ptr result = ctx_server.queue_results.recv(task_id);
-        ctx_server.queue_results.remove_waiting_task_id(task_id);
-
-        if (result->is_error()) {
-            res_error(res, result->to_json());
-            return;
-        }
-
-        GGML_ASSERT(dynamic_cast<server_task_result_slot_erase *>(result.get()) != nullptr);
-        res_ok(res, result->to_json());
-    };
-
-    const auto handle_slots_action = [&params, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) {
-        if (params.slot_save_path.empty()) {
-            res_error(res, format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
-        std::string id_slot_str = req.path_params.at("id_slot");
-        int id_slot;
-
-        try {
-            id_slot = std::stoi(id_slot_str);
-        } catch (const std::exception &) {
-            res_error(res, format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST));
-            return;
-        }
-
-        std::string action = req.get_param_value("action");
-
-        if (action == "save") {
-            handle_slots_save(req, res, id_slot);
-        } else if (action == "restore") {
-            handle_slots_restore(req, res, id_slot);
-        } else if (action == "erase") {
-            handle_slots_erase(req, res, id_slot);
-        } else {
-            res_error(res, format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST));
-        }
+        auto res = std::make_unique<server_resgen>(ctx_server);
+        res->ok({{"status", "ok"}});
+        return res;
     };
 
-    const auto handle_props = [&params, &ctx_server](const httplib::Request &, httplib::Response & res) {
-        json default_generation_settings_for_props;
-
-        {
-            slot_params params;
-
-            params.sampling = ctx_server.params_base.sampling;
-
-            default_generation_settings_for_props = json {
-                {"params", params.to_json(true)},
-                {"n_ctx",  ctx_server.slots[0].n_ctx},
-            };
-        }
-
-        // this endpoint is publicly available, please only return what is safe to be exposed
-        json data = {
-            { "default_generation_settings", default_generation_settings_for_props },
-            { "total_slots",                 ctx_server.params_base.n_parallel },
-            { "model_alias",                 ctx_server.params_base.model_alias },
-            { "model_path",                  ctx_server.params_base.model.path },
-            { "modalities",                  json {
-                {"vision", ctx_server.oai_parser_opt.allow_image},
-                {"audio",  ctx_server.oai_parser_opt.allow_audio},
-            } },
-            { "endpoint_slots",              params.endpoint_slots },
-            { "endpoint_props",              params.endpoint_props },
-            { "endpoint_metrics",            params.endpoint_metrics },
-            { "webui",                       params.webui },
-            { "chat_template",               common_chat_templates_source(ctx_server.chat_templates.get()) },
-            { "bos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)},
-            { "eos_token",                   common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)},
-            { "build_info",                  build_info },
-        };
-        if (ctx_server.params_base.use_jinja) {
-            if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) {
-                data["chat_template_tool_use"] = tool_use_src;
-            }
-        }
-
-        res_ok(res, data);
+    server_http_context::handler_t post_completions = [this](const server_http_request & req) {
+        std::vector<raw_buffer> files; // dummy
+        return handle_completions_impl(
+            SERVER_TASK_TYPE_COMPLETION,
+            req.body,
+            files,
+            req.should_stop,
+            OAICOMPAT_TYPE_NONE);
     };
 
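One practical consequence of handlers taking a server_http_request instead of httplib types is that they can be exercised without any HTTP machinery, which is the point of the split. A hypothetical smoke test (editor's sketch, not part of the patch; `no_stop` stands in for httplib's is_connection_closed callback):

    // hypothetical test helper, for illustration only
    void smoke_test(server_routes & routes) {
        std::function<bool()> no_stop = []() { return false; };
        server_http_request req = {
            /*query_params=*/{},
            /*body=*/json{{"prompt", "Hello"}, {"n_predict", 4}},
            /*should_stop=*/no_stop,
        };
        server_http_resgen_ptr res = routes.post_completions(req);
        printf("status=%d body=%s\n", res->status, res->data.c_str());
    }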
format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // update any props here - - res_ok(res, {{ "success", true }}); + server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) { + std::vector files; // dummy + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + req.body, + files, + req.should_stop, + OAICOMPAT_TYPE_COMPLETION); }; - const auto handle_api_show = [&ctx_server](const httplib::Request &, httplib::Response & res) { - bool has_mtmd = ctx_server.mctx != nullptr; - json data = { - { - "template", common_chat_templates_source(ctx_server.chat_templates.get()), - }, - { - "model_info", { - { "llama.context_length", ctx_server.slots.back().n_ctx, }, - } - }, - {"modelfile", ""}, - {"parameters", ""}, - {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }}, - {"model_info", ""}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})} - }; - - res_ok(res, data); + server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) { + std::vector files; + json body_cpy = req.body; + json data = oaicompat_chat_params_parse( + body_cpy, + ctx_server.oai_parser_opt, + files); + return handle_completions_impl( + SERVER_TASK_TYPE_COMPLETION, + body_cpy, + files, + req.should_stop, + OAICOMPAT_TYPE_CHAT); }; - // handle completion-like requests (completion, chat, infill) - // we can optionally provide a custom format for partial results and final results - const auto handle_completions_impl = [&ctx_server]( - server_task_type type, - json & data, - const std::vector & files, - const std::function & is_connection_closed, - httplib::Response & res, - oaicompat_type oaicompat) -> void { +private: + server_http_resgen_ptr handle_completions_impl( + server_task_type type, + const json & data, + const std::vector & files, + const std::function & should_stop, + oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); + auto res = std::make_unique(ctx_server); auto completion_id = gen_chatcmplid(); - // need to store the reader as a pointer, so that it won't be destroyed when the handle returns - // use shared_ptr as it's shared between the chunked_content_provider() and on_complete() - const auto rd = std::make_shared(ctx_server); + auto & rd = res->rd; try { std::vector tasks; @@ -5042,22 +4529,22 @@ int main(int argc, char ** argv) { tasks.push_back(std::move(task)); } - rd->post_tasks(std::move(tasks)); + rd.post_tasks(std::move(tasks)); } catch (const std::exception & e) { - res_error(res, format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); - return; + res->error(format_error_response(e.what(), ERROR_TYPE_INVALID_REQUEST)); + return res; } bool stream = json_value(data, "stream", false); if (!stream) { // non-stream, wait for the results - auto all_results = rd->wait_for_all(is_connection_closed); + auto all_results = rd.wait_for_all(should_stop); if (all_results.is_terminated) { - return; // connection is closed + return res; // connection is closed } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; + res->error(all_results.error->to_json()); + return res; } else { 
        bool stream = json_value(data, "stream", false);
 
         if (!stream) {
             // non-stream, wait for the results
-            auto all_results = rd->wait_for_all(is_connection_closed);
+            auto all_results = rd.wait_for_all(should_stop);
             if (all_results.is_terminated) {
-                return; // connection is closed
+                return res; // connection is closed
             } else if (all_results.error) {
-                res_error(res, all_results.error->to_json());
-                return;
+                res->error(all_results.error->to_json());
+                return res;
             } else {
                 json arr = json::array();
                 for (auto & res : all_results.results) {
@@ -5065,19 +4552,19 @@ int main(int argc, char ** argv) {
                     arr.push_back(res->to_json());
                 }
                 // if single request, return single object instead of array
-                res_ok(res, arr.size() == 1 ? arr[0] : arr);
+                res->ok(arr.size() == 1 ? arr[0] : arr);
             }
         } else {
             // in streaming mode, the first error must be treated as non-stream response
             // this is to match the OAI API behavior
             // ref: https://github.com/ggml-org/llama.cpp/pull/16486#discussion_r2419657309
-            server_task_result_ptr first_result = rd->next(is_connection_closed);
+            server_task_result_ptr first_result = rd.next(should_stop);
             if (first_result == nullptr) {
-                return; // connection is closed
+                return res; // connection is closed
             } else if (first_result->is_error()) {
-                res_error(res, first_result->to_json());
-                return;
+                res->error(first_result->to_json());
+                return res;
             } else {
                 GGML_ASSERT(
                     dynamic_cast<server_task_result_cmpl_partial *>(first_result.get()) != nullptr
@@ -5086,656 +4573,138 @@ int main(int argc, char ** argv) {
                 );
             }
 
             // next responses are streamed
-            json first_result_json = first_result->to_json();
-            const auto chunked_content_provider = [first_result_json, rd, oaicompat](size_t, httplib::DataSink & sink) mutable -> bool {
-                // flush the first result as it's not an error
-                if (!first_result_json.empty()) {
-                    if (!server_sent_event(sink, first_result_json)) {
-                        sink.done();
-                        return false; // sending failed, go to on_complete()
-                    }
-                    first_result_json.clear(); // mark as sent
+            res->ok(first_result->to_json());
+            res->content_type = "text/event-stream";
+            res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool {
+                if (should_stop()) {
+                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
+                    return false; // should_stop condition met
                 }
+                server_response_reader & rd = res_this->rd;
+                std::string & output = res_this->data;
+                output.clear();
 
                 // receive subsequent results
-                auto result = rd->next([&sink]{ return !sink.is_writable(); });
+                auto result = rd.next(should_stop);
                 if (result == nullptr) {
-                    sink.done();
-                    return false; // connection is closed, go to on_complete()
+                    SRV_DBG("%s", "stopping streaming due to should_stop condition\n");
+                    return false; // should_stop condition met
                 }
 
                 // send the results
                 json res_json = result->to_json();
-                bool ok = false;
                 if (result->is_error()) {
-                    ok = server_sent_event(sink, json {{ "error", result->to_json() }});
-                    sink.done();
-                    return false; // go to on_complete()
+                    output += format_sse(res_json);
+                    SRV_DBG("%s", "error received during streaming, terminating stream\n");
+                    return false; // terminate on error
                 } else {
                     GGML_ASSERT(
                         dynamic_cast<server_task_result_cmpl_partial *>(result.get()) != nullptr
                         || dynamic_cast<server_task_result_cmpl_final *>(result.get()) != nullptr
                     );
-                    ok = server_sent_event(sink, res_json);
-                }
-
-                if (!ok) {
-                    sink.done();
-                    return false; // sending failed, go to on_complete()
+                    output += format_sse(res_json);
                 }
 
                 // check if there is more data
-                if (!rd->has_next()) {
+                if (!rd.has_next()) {
                     if (oaicompat != OAICOMPAT_TYPE_NONE) {
-                        static const std::string ev_done = "data: [DONE]\n\n";
-                        sink.write(ev_done.data(), ev_done.size());
+                        output += "data: [DONE]\n\n";
                     }
-                    sink.done();
-                    return false; // no more data, go to on_complete()
+                    SRV_DBG("%s", "all results received, terminating stream\n");
+                    return false; // no more data, terminate
                 }
 
                 // has next data, continue
                 return true;
             };
-
-            auto on_complete = [rd](bool) {
-                rd->stop();
-            };
-
-            res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete);
         }
-    };
-
-    const auto 
handle_completions = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = json::parse(req.body); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); - }; - - const auto handle_completions_oai = [&handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - json data = oaicompat_completion_params_parse(json::parse(req.body)); - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_COMPLETION); - }; - - const auto handle_infill = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - // check model compatibility - std::string err; - if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "prefix token is missing. "; - } - if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "suffix token is missing. "; - } - if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { - err += "middle token is missing. "; - } - if (!err.empty()) { - res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - json data = json::parse(req.body); - - // validate input - if (data.contains("prompt") && !data.at("prompt").is_string()) { - // prompt is optional - res_error(res, format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_prefix")) { - res_error(res, format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (!data.contains("input_suffix")) { - res_error(res, format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); - } - - if (data.contains("input_extra") && !data.at("input_extra").is_array()) { - // input_extra is optional - res_error(res, format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - json input_extra = json_value(data, "input_extra", json::array()); - for (const auto & chunk : input_extra) { - // { "text": string, "filename": string } - if (!chunk.contains("text") || !chunk.at("text").is_string()) { - res_error(res, format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); - return; - } - // filename is optional - if (chunk.contains("filename") && !chunk.at("filename").is_string()) { - res_error(res, format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - data["input_extra"] = input_extra; // default to empty array if it's not exist - - std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); - SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); - data["prompt"] = format_infill( - ctx_server.vocab, - data.at("input_prefix"), - data.at("input_suffix"), - data.at("input_extra"), - ctx_server.params_base.n_batch, - ctx_server.params_base.n_predict, - ctx_server.slots[0].n_ctx, // TODO: there should be a better way - ctx_server.params_base.spm_infill, - tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
- ); - - std::vector files; // dummy - handle_completions_impl( - SERVER_TASK_TYPE_INFILL, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_NONE); // infill is not OAI compatible - }; - - const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) { - LOG_DBG("request: %s\n", req.body.c_str()); - - auto body = json::parse(req.body); - std::vector files; - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - - handle_completions_impl( - SERVER_TASK_TYPE_COMPLETION, - data, - files, - req.is_connection_closed, - res, - OAICOMPAT_TYPE_CHAT); - }; - - // same with handle_chat_completions, but without inference part - const auto handle_apply_template = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - auto body = json::parse(req.body); - std::vector files; // dummy, unused - json data = oaicompat_chat_params_parse( - body, - ctx_server.oai_parser_opt, - files); - res_ok(res, {{ "prompt", std::move(data.at("prompt")) }}); - }; - - const auto handle_models = [¶ms, &ctx_server, &state](const httplib::Request &, httplib::Response & res) { - server_state current_state = state.load(); - json model_meta = nullptr; - if (current_state == SERVER_STATE_READY) { - model_meta = ctx_server.model_meta(); } - bool has_mtmd = ctx_server.mctx != nullptr; - json models = { - {"models", { - { - {"name", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"model", params.model_alias.empty() ? params.model.path : params.model_alias}, - {"modified_at", ""}, - {"size", ""}, - {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash - {"type", "model"}, - {"description", ""}, - {"tags", {""}}, - {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})}, - {"parameters", ""}, - {"details", { - {"parent_model", ""}, - {"format", "gguf"}, - {"family", ""}, - {"families", {""}}, - {"parameter_size", ""}, - {"quantization_level", ""} - }} - } - }}, - {"object", "list"}, - {"data", { - { - {"id", params.model_alias.empty() ? 
params.model.path : params.model_alias}, - {"object", "model"}, - {"created", std::time(0)}, - {"owned_by", "llamacpp"}, - {"meta", model_meta}, - }, - }} - }; - - res_ok(res, models); - }; - const auto handle_tokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - json tokens_response = json::array(); - if (body.count("content") != 0) { - const bool add_special = json_value(body, "add_special", false); - const bool parse_special = json_value(body, "parse_special", true); - const bool with_pieces = json_value(body, "with_pieces", false); - - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); - - if (with_pieces) { - for (const auto& token : tokens) { - std::string piece = common_token_to_piece(ctx_server.ctx, token); - json piece_json; - - // Check if the piece is valid UTF-8 - if (is_valid_utf8(piece)) { - piece_json = piece; - } else { - // If not valid UTF-8, store as array of byte values - piece_json = json::array(); - for (unsigned char c : piece) { - piece_json.push_back(static_cast(c)); - } - } - - tokens_response.push_back({ - {"id", token}, - {"piece", piece_json} - }); - } - } else { - tokens_response = tokens; - } - } - - const json data = format_tokenizer_response(tokens_response); - res_ok(res, data); - }; - - const auto handle_detokenize = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - - std::string content; - if (body.count("tokens") != 0) { - const llama_tokens tokens = body.at("tokens"); - content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend()); - } - - const json data = format_detokenized_response(content); - res_ok(res, data); - }; - - const auto handle_embeddings_impl = [&ctx_server](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) { - if (!ctx_server.params_base.embedding) { - res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - const json body = json::parse(req.body); - - // for the shape of input/content, see tokenize_input_prompts() - json prompt; - if (body.count("input") != 0) { - prompt = body.at("input"); - } else if (body.contains("content")) { - oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible - prompt = body.at("content"); - } else { - res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } - - bool use_base64 = false; - if (body.count("encoding_format") != 0) { - const std::string& format = body.at("encoding_format"); - if (format == "base64") { - use_base64 = true; - } else if (format != "float") { - res_error(res, format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); - for (const auto & tokens : tokenized_prompts) { - // this check is necessary for models that do not add BOS token to the input - if (tokens.empty()) { - res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } - - int embd_normalize = 2; // default to Euclidean/L2 norm - if (body.count("embd_normalize") != 0) { - embd_normalize = body.at("embd_normalize"); - if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { - SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); - } - } - - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); - - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tokenized_prompts[i]); - - // OAI-compat - task.params.oaicompat = oaicompat; - task.params.embd_normalize = embd_normalize; - - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } - - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); - - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } - - // write JSON response - json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING - ? format_embeddings_response_oaicompat(body, responses, use_base64) - : json(responses); - res_ok(res, root); - }; - - const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_NONE); - }; - - const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { - handle_embeddings_impl(req, res, OAICOMPAT_TYPE_EMBEDDING); - }; - - const auto handle_rerank = [&ctx_server](const httplib::Request & req, httplib::Response & res) { - if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { - res_error(res, format_error_response("This server does not support reranking. 
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); - return; - } - - const json body = json::parse(req.body); - - // if true, use TEI API format, otherwise use Jina API format - // Jina: https://jina.ai/reranker/ - // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank - bool is_tei_format = body.contains("texts"); - - json query; - if (body.count("query") == 1) { - query = body.at("query"); - if (!query.is_string()) { - res_error(res, format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); - return; - } - } else { - res_error(res, format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); - return; - } + return res; + } +}; - std::vector documents = json_value(body, "documents", - json_value(body, "texts", std::vector())); - if (documents.empty()) { - res_error(res, format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); - return; - } +std::function shutdown_handler; +std::atomic_flag is_terminating = ATOMIC_FLAG_INIT; - int top_n = json_value(body, "top_n", (int)documents.size()); +inline void signal_handler(int signal) { + if (is_terminating.test_and_set()) { + // in case it hangs, we can force terminate the server by hitting Ctrl+C twice + // this is for better developer experience, we can remove when the server is stable enough + fprintf(stderr, "Received second interrupt, terminating immediately.\n"); + exit(1); + } - // create and queue the task - json responses = json::array(); - server_response_reader rd(ctx_server); - { - std::vector tasks; - tasks.reserve(documents.size()); - for (size_t i = 0; i < documents.size(); i++) { - auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); - server_task task = server_task(SERVER_TASK_TYPE_RERANK); - task.id = ctx_server.queue_tasks.get_new_id(); - task.index = i; - task.tokens = std::move(tmp); - tasks.push_back(std::move(task)); - } - rd.post_tasks(std::move(tasks)); - } + shutdown_handler(signal); +} - // wait for the results - auto all_results = rd.wait_for_all(req.is_connection_closed); +int main(int argc, char ** argv) { + // own arguments required by this example + common_params params; - // collect results - if (all_results.is_terminated) { - return; // connection is closed - } else if (all_results.error) { - res_error(res, all_results.error->to_json()); - return; - } else { - for (auto & res : all_results.results) { - GGML_ASSERT(dynamic_cast(res.get()) != nullptr); - responses.push_back(res->to_json()); - } - } + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { + return 1; + } - // write JSON response - json root = format_response_rerank( - body, - responses, - is_tei_format, - documents, - top_n); + // TODO: should we have a separate n_parallel parameter for the server? 
+ // https://github.com/ggml-org/llama.cpp/pull/16736#discussion_r2483763177 + // TODO: this is a common configuration that is suitable for most local use cases + // however, overriding the parameters is a bit confusing - figure out something more intuitive + if (params.n_parallel == 1 && params.kv_unified == false && !params.has_speculative()) { + LOG_WRN("%s: setting n_parallel = 4 and kv_unified = true (add -kvu to disable this)\n", __func__); - res_ok(res, root); - }; + params.n_parallel = 4; + params.kv_unified = true; + } - const auto handle_lora_adapters_list = [&](const httplib::Request &, httplib::Response & res) { - json result = json::array(); - const auto & loras = ctx_server.params_base.lora_adapters; - for (size_t i = 0; i < loras.size(); ++i) { - auto & lora = loras[i]; - json entry = { - {"id", i}, - {"path", lora.path}, - {"scale", lora.scale}, - {"task_name", lora.task_name}, - {"prompt_prefix", lora.prompt_prefix}, - }; - std::string alora_invocation_string = ""; - const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); - std::vector alora_invocation_tokens; - if (n_alora_tokens) { - const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); - for (uint64_t i = 0; i < n_alora_tokens; ++i) { - alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); - alora_invocation_tokens.push_back(alora_tokens[i]); - } - entry["alora_invocation_string"] = alora_invocation_string; - entry["alora_invocation_tokens"] = alora_invocation_tokens; - } - result.push_back(std::move(entry)); - } - res_ok(res, result); - res.status = 200; // HTTP OK - }; + common_init(); - const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) { - const json body = json::parse(req.body); - if (!body.is_array()) { - res_error(res, format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); - return; - } + // struct that contains llama context and inference + server_context ctx_server; - int task_id = ctx_server.queue_tasks.get_new_id(); - { - server_task task(SERVER_TASK_TYPE_SET_LORA); - task.id = task_id; - task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); - ctx_server.queue_results.add_waiting_task_id(task_id); - ctx_server.queue_tasks.post(std::move(task)); - } + // Necessary similarity of prompt for slot selection + ctx_server.slot_prompt_similarity = params.slot_prompt_similarity; - // get the result - server_task_result_ptr result = ctx_server.queue_results.recv(task_id); - ctx_server.queue_results.remove_waiting_task_id(task_id); + llama_backend_init(); + llama_numa_init(params.numa); - if (result->is_error()) { - res_error(res, result->to_json()); - return; - } + LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency()); + LOG_INF("\n"); + LOG_INF("%s\n", common_params_get_system_info(params).c_str()); + LOG_INF("\n"); - GGML_ASSERT(dynamic_cast(result.get()) != nullptr); - res_ok(res, result->to_json()); - }; + server_http_context ctx_http; + if (!ctx_http.init(params)) { + LOG_ERR("%s: failed to initialize HTTP server\n", __func__); + return 1; + } // // Router // - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = 
svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); - } - return false; - }); - } - } - // register API routes - svr->Get (params.api_prefix + "/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/health", handle_health); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/metrics", handle_metrics); - svr->Get (params.api_prefix + "/props", handle_props); - svr->Post(params.api_prefix + "/props", handle_props_change); - svr->Post(params.api_prefix + "/api/show", handle_api_show); - svr->Get (params.api_prefix + "/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/v1/models", handle_models); // public endpoint (no API key check) - svr->Get (params.api_prefix + "/api/tags", handle_models); // ollama specific endpoint. public endpoint (no API key check) - svr->Post(params.api_prefix + "/completion", handle_completions); // legacy - svr->Post(params.api_prefix + "/completions", handle_completions); - svr->Post(params.api_prefix + "/v1/completions", handle_completions_oai); - svr->Post(params.api_prefix + "/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/v1/chat/completions", handle_chat_completions); - svr->Post(params.api_prefix + "/api/chat", handle_chat_completions); // ollama specific endpoint - svr->Post(params.api_prefix + "/infill", handle_infill); - svr->Post(params.api_prefix + "/embedding", handle_embeddings); // legacy - svr->Post(params.api_prefix + "/embeddings", handle_embeddings); - svr->Post(params.api_prefix + "/v1/embeddings", handle_embeddings_oai); - svr->Post(params.api_prefix + "/rerank", handle_rerank); - svr->Post(params.api_prefix + "/reranking", handle_rerank); - svr->Post(params.api_prefix + "/v1/rerank", handle_rerank); - svr->Post(params.api_prefix + "/v1/reranking", handle_rerank); - svr->Post(params.api_prefix + "/tokenize", handle_tokenize); - svr->Post(params.api_prefix + "/detokenize", handle_detokenize); - svr->Post(params.api_prefix + "/apply-template", handle_apply_template); - // LoRA adapters hotswap - svr->Get (params.api_prefix + "/lora-adapters", handle_lora_adapters_list); - svr->Post(params.api_prefix + "/lora-adapters", handle_lora_adapters_apply); - // Save & load slots - svr->Get (params.api_prefix + "/slots", handle_slots); - svr->Post(params.api_prefix + "/slots/:id_slot", handle_slots_action); + server_routes routes(ctx_server); + ctx_http.get ("/health", routes.get_health); + ctx_http.post("/completion", routes.post_completions); // legacy + ctx_http.post("/completions", routes.post_completions); + ctx_http.post("/v1/completions", routes.post_completions_oai); + ctx_http.post("/chat/completions", 
routes.post_chat_completions); + ctx_http.post("/v1/chat/completions", routes.post_chat_completions); + ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint // // Start the server // - if (params.n_threads_http < 1) { - // +2 threads for monitoring endpoints - params.n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - log_data["n_threads_http"] = std::to_string(params.n_threads_http); - svr->new_task_queue = [¶ms] { return new httplib::ThreadPool(params.n_threads_http); }; - // clean up function, to be called before exit - auto clean_up = [&svr, &ctx_server]() { + // setup clean up function, to be called before exit + auto clean_up = [&ctx_http, &ctx_server]() { SRV_INF("%s: cleaning up before exit...\n", __func__); - svr->stop(); + ctx_http.stop(); ctx_server.queue_results.terminate(); llama_backend_free(); }; - bool was_bound = false; - bool is_sock = false; - if (string_ends_with(std::string(params.hostname), ".sock")) { - is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); - svr->set_address_family(AF_UNIX); - // bind_to_port requires a second arg, any value other than 0 should - // simply get ignored - was_bound = svr->bind_to_port(params.hostname, 8080); - } else { - LOG_INF("%s: binding port with default address family\n", __func__); - // bind HTTP listen port - if (params.port == 0) { - int bound_port = svr->bind_to_any_port(params.hostname); - if ((was_bound = (bound_port >= 0))) { - params.port = bound_port; - } - } else { - was_bound = svr->bind_to_port(params.hostname, params.port); - } - } - - if (!was_bound) { - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, params.hostname.c_str(), params.port); - clean_up(); - return 1; - } - - // run the HTTP server in a thread - std::thread t([&]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); // load the model @@ -5743,13 +4712,13 @@ int main(int argc, char ** argv) { if (!ctx_server.load_model(params)) { clean_up(); - t.join(); + ctx_http.thread.join(); LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; } ctx_server.init(); - state.store(SERVER_STATE_READY); + ctx_http.is_ready.store(true); LOG_INF("%s: model loaded\n", __func__); @@ -5785,15 +4754,18 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - LOG_INF("%s: server is listening on %s - starting the main loop\n", __func__, - is_sock ? 
string_format("unix://%s", params.hostname.c_str()).c_str() : - string_format("http://%s:%d", params.hostname.c_str(), params.port).c_str()); + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } + LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); clean_up(); - t.join(); + ctx_http.thread.join(); llama_memory_breakdown_print(ctx_server.ctx); return 0; diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index b1ecc5af5ed0a..2b20cf0b87c99 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -453,29 +453,25 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } +// format server-sent event (SSE), return the formatted string to send // note: if data is a json array, it will be sent as multiple events, one per item -static bool server_sent_event(httplib::DataSink & sink, const json & data) { - static auto send_single = [](httplib::DataSink & sink, const json & data) -> bool { - const std::string str = - "data: " + - data.dump(-1, ' ', false, json::error_handler_t::replace) + +static std::string format_sse(const json & data) { + std::ostringstream ss; + auto send_single = [&ss](const json & data) { + ss << "data: " << + data.dump(-1, ' ', false, json::error_handler_t::replace) << "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). - - LOG_DBG("data stream, to_send: %s", str.c_str()); - return sink.write(str.c_str(), str.size()); }; if (data.is_array()) { for (const auto & item : data) { - if (!send_single(sink, item)) { - return false; - } + send_single(item); } } else { - return send_single(sink, data); + send_single(data); } - return true; + return ss.str(); } // From fe98058fd88caf0d0d3c627622ae4e5567689c7c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 21:00:15 +0100 Subject: [PATCH 02/21] move server-http and httplib to its own file --- tools/server/CMakeLists.txt | 2 + tools/server/server-http.cpp | 354 +++++++++++++++++++++++++++++++++++ tools/server/server-http.h | 335 +++------------------------------ tools/server/utils.hpp | 5 +- 4 files changed, 380 insertions(+), 316 deletions(-) create mode 100644 tools/server/server-http.cpp diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index c801e84c3d415..1fccfdd17f138 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -14,6 +14,8 @@ endif() set(TARGET_SRCS server.cpp utils.hpp + server-http.cpp + server-http.h ) set(PUBLIC_ASSETS index.html.gz diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp new file mode 100644 index 0000000000000..0324e8d69f96c --- /dev/null +++ b/tools/server/server-http.cpp @@ -0,0 +1,354 @@ +#include "utils.hpp" +#include "common.h" +#include "server-http.h" + +#include + +#include +#include +#include + +// auto generated files (see README.md for details) +#include "index.html.gz.hpp" +#include "loading.html.hpp" + +// +// HTTP implementation using cpp-httplib +// + +class server_http_context::Impl { +public: + std::unique_ptr srv; +}; + +server_http_context::server_http_context() + : pimpl(std::make_unique()) +{} + +server_http_context::~server_http_context() = default; + +static void log_server_request(const httplib::Request & req, const httplib::Response & res) { + // skip GH copilot requests when using default port + if 
(req.path == "/v1/health") {
+        return;
+    }
+
+    // reminder: this function is not covered by httplib's exception handler; if someone does more complicated stuff, think about wrapping it in try-catch
+
+    SRV_INF("request: %s %s %s %d\n", req.method.c_str(), req.path.c_str(), req.remote_addr.c_str(), res.status);
+
+    SRV_DBG("request:  %s\n", req.body.c_str());
+    SRV_DBG("response: %s\n", res.body.c_str());
+}
+
+bool server_http_context::init(const common_params & params) {
+    path_prefix = params.api_prefix;
+    port        = params.port;
+    hostname    = params.hostname;
+
+#ifdef CPPHTTPLIB_OPENSSL_SUPPORT
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str());
+        pimpl->srv.reset(
+            new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str())
+        );
+    } else {
+        LOG_INF("Running without SSL\n");
+        pimpl->srv.reset(new httplib::Server());
+    }
+#else
+    if (params.ssl_file_key != "" && params.ssl_file_cert != "") {
+        LOG_ERR("Server is built without SSL support\n");
+        return false;
+    }
+    pimpl->srv.reset(new httplib::Server());
+#endif
+
+    auto & srv = pimpl->srv;
+    srv->set_default_headers({{"Server", "llama.cpp"}});
+    srv->set_logger(log_server_request);
+    srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
+        std::string message;
+        try {
+            std::rethrow_exception(ep);
+        } catch (const std::exception & e) {
+            message = e.what();
+        } catch (...) {
+            message = "Unknown Exception";
+        }
+
+        // FIXME
+        GGML_UNUSED(res);
+        GGML_UNUSED(message);
+        // try {
+        //     json formatted_error = format_error_response(message, ERROR_TYPE_SERVER);
+        //     LOG_WRN("got exception: %s\n", formatted_error.dump().c_str());
+        //     res_error(res, formatted_error);
+        // } catch (const std::exception & e) {
+        //     LOG_ERR("got another exception: %s | while handling exception: %s\n", e.what(), message.c_str());
+        // }
+    });
+
+    srv->set_error_handler([](const httplib::Request &, httplib::Response & res) {
+        if (res.status == 404) {
+            // FIXME
+            //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND));
+            res.set_content("404 Not Found", "text/plain");
+        }
+        // for other error codes, we skip processing here because it's already done by res_error()
+    });
+
+    // set timeouts and change hostname and port
+    srv->set_read_timeout (params.timeout_read);
+    srv->set_write_timeout(params.timeout_write);
+
+    if (params.api_keys.size() == 1) {
+        auto key = params.api_keys[0];
+        std::string substr = key.substr(std::max((int)(key.length() - 4), 0));
+        LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str());
+    } else if (params.api_keys.size() > 1) {
+        LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size());
+    }
+
+    //
+    // Middlewares
+    //
+
+    auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) {
+        static const std::unordered_set public_endpoints = {
+            "/health",
+            "/v1/health",
+            "/models",
+            "/v1/models",
+            "/api/tags"
+        };
+
+        // If API key is not set, skip validation
+        if (api_keys.empty()) {
+            return true;
+        }
+
+        // If path is public or is static file, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") {
+            return true;
+        }
+
+        // Check for API key in the header
+        auto auth_header = req.get_header_value("Authorization");
+
+        std::string prefix = "Bearer ";
+        if (auth_header.substr(0, prefix.size()) == prefix) {
+            std::string received_api_key = auth_header.substr(prefix.size());
+            if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) {
+                return true; // API key is valid
+            }
+        }
+
+        // API key is invalid or not provided
+        //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION));
+        // FIXME
+        res.status = 401;
+        res.set_content("Unauthorized: Invalid API Key", "text/plain");
+
+        LOG_WRN("Unauthorized: Invalid API Key\n");
+
+        return false;
+    };
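Editor's note: the bearer check above is easy to reason about in isolation. Below is a minimal standalone sketch of the same logic; `check_bearer` and the sample key are illustrative and not part of this patch, and the real middleware additionally whitelists the public endpoints listed above.

```cpp
// Sketch of the Bearer-token check used by middleware_validate_api_key.
// check_bearer() is a hypothetical helper, not part of the patch.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

static bool check_bearer(const std::vector<std::string> & api_keys, const std::string & auth_header) {
    if (api_keys.empty()) {
        return true; // no keys configured -> validation disabled
    }
    const std::string prefix = "Bearer ";
    if (auth_header.substr(0, prefix.size()) != prefix) {
        return false; // missing or malformed Authorization header
    }
    const std::string received = auth_header.substr(prefix.size());
    return std::find(api_keys.begin(), api_keys.end(), received) != api_keys.end();
}

int main() {
    const std::vector<std::string> keys = { "secret-key" };
    std::cout << std::boolalpha
              << check_bearer(keys, "Bearer secret-key") << "\n"  // true  (accepted)
              << check_bearer(keys, "Bearer wrong-key")  << "\n"  // false (rejected)
              << check_bearer(keys, "")                  << "\n"; // false (rejected)
}
```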
+
+    auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) {
+        bool ready = is_ready.load();
+        if (!ready) {
+            auto tmp = string_split(req.path, '.');
+            if (req.path == "/" || tmp.back() == "html") {
+                res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8");
+                res.status = 503;
+            } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") {
+                // allow the models endpoint to be accessed during loading
+                return true;
+            } else {
+                // FIXME
+                //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE));
+                res.status = 503;
+                res.set_content("503 Service Unavailable: Loading model", "text/plain");
+            }
+            return false;
+        }
+        return true;
+    };
+
+    // register server middlewares
+    srv->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        // If this is OPTIONS request, skip validation because browsers don't include Authorization header
+        if (req.method == "OPTIONS") {
+            res.set_header("Access-Control-Allow-Credentials", "true");
+            res.set_header("Access-Control-Allow-Methods",     "GET, POST");
+            res.set_header("Access-Control-Allow-Headers",     "*");
+            res.set_content("", "text/html"); // blank response, no data
+            return httplib::Server::HandlerResponse::Handled; // skip further processing
+        }
+        if (!middleware_server_state(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        if (!middleware_validate_api_key(req, res)) {
+            return httplib::Server::HandlerResponse::Handled;
+        }
+        return httplib::Server::HandlerResponse::Unhandled;
+    });
+
+    int n_threads_http = params.n_threads_http;
+    if (n_threads_http < 1) {
+        // +2 threads for monitoring endpoints
+        n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1);
+    }
+    LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http);
+    srv->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); };
+
+    //
+    // Web UI setup
+    //
+
+    if (!params.webui) {
+        LOG_INF("Web UI is disabled\n");
+    } else {
+        // register static assets routes
+        if (!params.public_path.empty()) {
+            // Set the base directory for serving static files
+            bool is_found = srv->set_mount_point(params.api_prefix + "/", params.public_path);
+            if (!is_found) {
+                LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str());
+                return false;
+            }
+        } else {
+            // using embedded static index.html
+            srv->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) {
+                if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) {
+                    res.set_content("Error: gzip is not supported by this browser", "text/plain");
+                } else {
+                    res.set_header("Content-Encoding", "gzip");
+                    // COEP and COOP headers, required by pyodide (python interpreter)
res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } + return false; + }); + } + } + return true; +} + +bool server_http_context::start() { + // Bind and listen + + auto & srv = pimpl->srv; + bool was_bound = false; + bool is_sock = false; + if (string_ends_with(std::string(hostname), ".sock")) { + is_sock = true; + LOG_INF("%s: setting address family to AF_UNIX\n", __func__); + srv->set_address_family(AF_UNIX); + // bind_to_port requires a second arg, any value other than 0 should + // simply get ignored + was_bound = srv->bind_to_port(hostname, 8080); + } else { + LOG_INF("%s: binding port with default address family\n", __func__); + // bind HTTP listen port + if (port == 0) { + int bound_port = srv->bind_to_any_port(hostname); + if ((was_bound = (bound_port >= 0))) { + port = bound_port; + } + } else { + was_bound = srv->bind_to_port(hostname, port); + } + } + + if (!was_bound) { + LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); + return false; + } + + // run the HTTP server in a thread + thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); + srv->wait_until_ready(); + + LOG_INF("%s: server is listening on %s\n", __func__, + is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : + string_format("http://%s:%d", hostname.c_str(), port).c_str()); + return true; +} + +void server_http_context::stop() { + if (pimpl->srv) { + pimpl->srv->stop(); + } +} + +static void set_headers(httplib::Response & res, const std::map & headers) { + for (const auto & [key, value] : headers) { + res.set_header(key, value); + } +} + +static std::map get_params(const httplib::Request & req) { + std::map params; + for (const auto & [key, value] : req.params) { + params[key] = value; + } + for (const auto & [key, value] : req.path_params) { + params[key] = value; + } + return params; +} + +void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { + pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_resgen_ptr response = handler(server_http_request{ + get_params(req), + json{}, + req.is_connection_closed + }); + GGML_ASSERT(!response->is_stream() && "not supported for GET method"); + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + }); +} + +void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { + pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_resgen_ptr response = handler(server_http_request{ + get_params(req), + json::parse(req.body.empty() ? "{}" : req.body), + req.is_connection_closed + }); + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + // TODO: maybe handle sink.write unsuccessful case? 
for now, we rely on is_connection_closed()
+            sink.write(response->data.data(), response->data.size());
+            SRV_DBG("http: streamed chunk: %s\n", response->data.c_str());
+            if (!response->next()) {
+                SRV_DBG("%s", "http: stream ended\n");
+                sink.done();
+                return false; // end of stream
+            }
+            return true;
+        };
+        const auto on_complete = [response = r_ptr](bool) mutable {
+            response.reset(); // trigger the destruction of the response object
+        };
+        res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete);
+    } else {
+        res.status = response->status;
+        set_headers(res, response->headers);
+        res.set_content(response->data, response->content_type);
+    }
+  });
+}
+
diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 5d93b94b6ce95..b0dc510e2ec33 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -7,18 +7,17 @@
 #include
 #include
 
-// auto generated files (see README.md for details)
-#include "index.html.gz.hpp"
-#include "loading.html.hpp"
-
 // generator-like API for HTTP response generation
+// this object responds in one of two modes:
+// 1) normal response: `data` contains the full response body
+// 2) streaming response: each call to next() generates the next chunk, stored in `data`
+//    when next() returns false, the stream ends
 struct server_http_resgen {
     std::string content_type = "application/json; charset=utf-8";
     int status = 200;
     std::string data;
+    std::map headers;
 
-    // if is_stream is true, next() will return true until the stream ends
-    // the data member will contain the next chunk of data to send
     // TODO: move this to a virtual function once we have proper polymorphism support
     std::function next = nullptr;
     bool is_stream() const {
@@ -33,20 +32,33 @@ struct server_http_resgen {
 using server_http_resgen_ptr = std::unique_ptr;
 
 struct server_http_request {
-    std::unordered_map query_params;
+    std::map params; // path_params + query_params
     json body;
     const std::function & should_stop;
+
+    std::string get_param(const std::string & key, const std::string & def = "") const {
+        auto it = params.find(key);
+        if (it != params.end()) {
+            return it->second;
+        }
+        return def;
+    }
 };
 
 struct server_http_context {
-    std::thread thread;
-    std::unique_ptr svr;
+    class Impl;
+    std::unique_ptr pimpl;
+
+    std::thread thread; // server thread
     std::atomic is_ready = false;
 
     std::string path_prefix;
    std::string hostname;
    int port;
 
+    server_http_context();
+    ~server_http_context();
+
     bool init(const common_params & params);
     bool start();
     void stop();
@@ -55,308 +67,3 @@ struct server_http_context {
     void get(const std::string &, handler_t);
     void post(const std::string &, handler_t);
 };
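Editor's note: to make the generator contract above concrete, here is a minimal sketch of a streaming handler built against this interface, under the semantics documented in the header at this point of the series (each call to `next()` overwrites `data` with the following chunk; returning false ends the stream). The handler, its chunk contents, and the counting logic are illustrative, not part of this patch.

```cpp
// Hypothetical streaming response that emits "0", "1", "2" as SSE-style chunks.
#include <memory>
#include <string>
#include "server-http.h" // assumed visible to the handler

server_http_resgen_ptr make_counting_stream() {
    auto res = std::make_unique<server_http_resgen>();
    res->content_type = "text/event-stream";
    res->data         = "data: 0\n\n"; // first chunk, written before next() is first called
    auto counter = std::make_shared<int>(0);
    // Capturing the raw pointer is safe: the lambda is owned by the object itself,
    // mirroring the `res_this = res.get()` pattern used later in server.cpp.
    res->next = [self = res.get(), counter]() -> bool {
        if (++*counter >= 3) {
            return false; // stream ends; nothing new was placed in `data`
        }
        self->data = "data: " + std::to_string(*counter) + "\n\n"; // overwrite with next chunk
        return true;
    };
    return res;
}
```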
LOG_INF("Running with SSL: key = %s, cert = %s\n", params.ssl_file_key.c_str(), params.ssl_file_cert.c_str()); - svr.reset( - new httplib::SSLServer(params.ssl_file_cert.c_str(), params.ssl_file_key.c_str()) - ); - } else { - LOG_INF("Running without SSL\n"); - svr.reset(new httplib::Server()); - } -#else - if (params.ssl_file_key != "" && params.ssl_file_cert != "") { - LOG_ERR("Server is built without SSL support\n"); - return false; - } - svr.reset(new httplib::Server()); -#endif - - svr->set_default_headers({{"Server", "llama.cpp"}}); - svr->set_logger(log_server_request); - svr->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { - std::string message; - try { - std::rethrow_exception(ep); - } catch (const std::exception & e) { - message = e.what(); - } catch (...) { - message = "Unknown Exception"; - } - - // FIXME - GGML_UNUSED(res); - GGML_UNUSED(message); - // try { - // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - // res_error(res, formatted_error); - // } catch (const std::exception & e) { - // LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - // } - }); - - svr->set_error_handler([](const httplib::Request &, httplib::Response & res) { - if (res.status == 404) { - // FIXME - //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - res.set_content("404 Not Found", "text/plain"); - } - // for other error codes, we skip processing here because it's already done by res_error() - }); - - // set timeouts and change hostname and port - svr->set_read_timeout (params.timeout_read); - svr->set_write_timeout(params.timeout_write); - - if (params.api_keys.size() == 1) { - auto key = params.api_keys[0]; - std::string substr = key.substr(std::max((int)(key.length() - 4), 0)); - LOG_INF("%s: api_keys: ****%s\n", __func__, substr.c_str()); - } else if (params.api_keys.size() > 1) { - LOG_INF("%s: api_keys: %zu keys loaded\n", __func__, params.api_keys.size()); - } - - // - // Middlewares - // - - auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { - "/health", - "/v1/health", - "/models", - "/v1/models", - "/api/tags" - }; - - // If API key is not set, skip validation - if (api_keys.empty()) { - return true; - } - - // If path is public or is static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end() || req.path == "/") { - return true; - } - - // Check for API key in the header - auto auth_header = req.get_header_value("Authorization"); - - std::string prefix = "Bearer "; - if (auth_header.substr(0, prefix.size()) == prefix) { - std::string received_api_key = auth_header.substr(prefix.size()); - if (std::find(api_keys.begin(), api_keys.end(), received_api_key) != api_keys.end()) { - return true; // API key is valid - } - } - - // API key is invalid or not provided - //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - // FIXME - res.status = 401; - res.set_content("Unauthorized: Invalid API Key", "text/plain"); - - LOG_WRN("Unauthorized: Invalid API Key\n"); - - return false; - }; - - auto middleware_server_state = [this](const httplib::Request & req, httplib::Response & res) { - bool ready = is_ready.load(); - if (!ready) { - auto tmp = string_split(req.path, '.'); - if 
(req.path == "/" || tmp.back() == "html") { - res.set_content(reinterpret_cast(loading_html), loading_html_len, "text/html; charset=utf-8"); - res.status = 503; - } else if (req.path == "/models" || req.path == "/v1/models" || req.path == "/api/tags") { - // allow the models endpoint to be accessed during loading - return true; - } else { - // FIXME - //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); - res.status = 503; - res.set_content("503 Service Unavailable: Loading model", "text/plain"); - } - return false; - } - return true; - }; - - // register server middlewares - svr->set_pre_routing_handler([middleware_validate_api_key, middleware_server_state](const httplib::Request & req, httplib::Response & res) { - res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); - // If this is OPTIONS request, skip validation because browsers don't include Authorization header - if (req.method == "OPTIONS") { - res.set_header("Access-Control-Allow-Credentials", "true"); - res.set_header("Access-Control-Allow-Methods", "GET, POST"); - res.set_header("Access-Control-Allow-Headers", "*"); - res.set_content("", "text/html"); // blank response, no data - return httplib::Server::HandlerResponse::Handled; // skip further processing - } - if (!middleware_server_state(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - if (!middleware_validate_api_key(req, res)) { - return httplib::Server::HandlerResponse::Handled; - } - return httplib::Server::HandlerResponse::Unhandled; - }); - - int n_threads_http = params.n_threads_http; - if (n_threads_http < 1) { - // +2 threads for monitoring endpoints - n_threads_http = std::max(params.n_parallel + 2, (int32_t) std::thread::hardware_concurrency() - 1); - } - LOG_INF("%s: using %d threads for HTTP server\n", __func__, n_threads_http); - svr->new_task_queue = [n_threads_http] { return new httplib::ThreadPool(n_threads_http); }; - - // - // Web UI setup - // - - if (!params.webui) { - LOG_INF("Web UI is disabled\n"); - } else { - // register static assets routes - if (!params.public_path.empty()) { - // Set the base directory for serving static files - bool is_found = svr->set_mount_point(params.api_prefix + "/", params.public_path); - if (!is_found) { - LOG_ERR("%s: static assets path not found: %s\n", __func__, params.public_path.c_str()); - return 1; - } - } else { - // using embedded static index.html - svr->Get(params.api_prefix + "/", [](const httplib::Request & req, httplib::Response & res) { - if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { - res.set_content("Error: gzip is not supported by this browser", "text/plain"); - } else { - res.set_header("Content-Encoding", "gzip"); - // COEP and COOP headers, required by pyodide (python interpreter) - res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); - res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); - } - return false; - }); - } - } - return true; -} - -bool server_http_context::start() { - // Bind and listen - - bool was_bound = false; - bool is_sock = false; - if (string_ends_with(std::string(hostname), ".sock")) { - is_sock = true; - LOG_INF("%s: setting address family to AF_UNIX\n", __func__); - svr->set_address_family(AF_UNIX); - // bind_to_port requires a second arg, any value other than 0 should - // simply get ignored - was_bound = svr->bind_to_port(hostname, 8080); - } else { - 
LOG_INF("%s: binding port with default address family\n", __func__); - // bind HTTP listen port - if (port == 0) { - int bound_port = svr->bind_to_any_port(hostname); - if ((was_bound = (bound_port >= 0))) { - port = bound_port; - } - } else { - was_bound = svr->bind_to_port(hostname, port); - } - } - - if (!was_bound) { - LOG_ERR("%s: couldn't bind HTTP server socket, hostname: %s, port: %d\n", __func__, hostname.c_str(), port); - return false; - } - - // run the HTTP server in a thread - thread = std::thread([this]() { svr->listen_after_bind(); }); - svr->wait_until_ready(); - - LOG_INF("%s: server is listening on %s\n", __func__, - is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : - string_format("http://%s:%d", hostname.c_str(), port).c_str()); - return true; -} - -void server_http_context::stop() { - if (svr) { - svr->stop(); - } -} - -void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { - svr->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ - req.path_params, - json{}, - req.is_connection_closed - }); - GGML_ASSERT(!response->is_stream() && "not supported for GET method"); - res.status = response->status; - res.set_content(response->data, response->content_type); - }); -} - -void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { - svr->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ - req.path_params, - json::parse(req.body.empty() ? "{}" : req.body), - req.is_connection_closed - }); - if (response->is_stream()) { - res.status = response->status; - std::string content_type = response->content_type; - // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful case? 
for now, we rely on is_connection_closed() - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); - if (!response->next()) { - SRV_DBG("%s", "http: stream ended\n"); - sink.done(); - return false; // end of stream - } - return true; - }; - const auto on_complete = [response = r_ptr](bool) mutable { - response.reset(); // trigger the destruction of the response object - }; - res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); - } else { - res.status = response->status; - res.set_content(response->data, response->content_type); - } - }); -} diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 2b20cf0b87c99..d828c922f26c9 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -9,8 +9,6 @@ #include "mtmd-helper.h" #include "chat.h" -#include - #define JSON_ASSERT GGML_ASSERT #include @@ -21,6 +19,9 @@ #include #include +#define JSON_ASSERT GGML_ASSERT +#include + #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; From 473b0e58236fb80944af18c2f12027779fb78395 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 21:59:01 +0100 Subject: [PATCH 03/21] add the remaining endpoints --- tools/server/server-http.cpp | 12 +- tools/server/server-http.h | 4 +- tools/server/server.cpp | 865 ++++++++++++++++++++++++++++++++++- 3 files changed, 860 insertions(+), 21 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 0324e8d69f96c..5b3b665f4b22f 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -80,6 +80,7 @@ bool server_http_context::init(const common_params & params) { // FIXME GGML_UNUSED(res); GGML_UNUSED(message); + printf("Exception caught in HTTP server: %s\n", message.c_str()); // try { // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); @@ -306,7 +307,7 @@ void server_http_context::get(const std::string & path, server_http_context::han pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_resgen_ptr response = handler(server_http_request{ get_params(req), - json{}, + req.body, req.is_connection_closed }); GGML_ASSERT(!response->is_stream() && "not supported for GET method"); @@ -320,7 +321,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_resgen_ptr response = handler(server_http_request{ get_params(req), - json::parse(req.body.empty() ? "{}" : req.body), + req.body, req.is_connection_closed }); if (response->is_stream()) { @@ -330,15 +331,18 @@ void server_http_context::post(const std::string & path, server_http_context::ha // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful case? for now, we rely on is_connection_closed() + // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() sink.write(response->data.data(), response->data.size()); SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); if (!response->next()) { + // flush the remaining data + sink.write(response->data.data(), response->data.size()); + SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); SRV_DBG("%s", "http: stream ended\n"); sink.done(); return false; // end of stream } - return true; + return true; // more data, continue the loop }; const auto on_complete = [response = r_ptr](bool) mutable { response.reset(); // trigger the destruction of the response object diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b0dc510e2ec33..f449bfedc27b0 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -11,7 +11,7 @@ // this object response with one of the 2 modes: // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` -// when next() returns false, the stream ends +// when next() returns false, no more data after the current chunk struct server_http_resgen { std::string content_type = "application/json; charset=utf-8"; int status = 200; @@ -33,7 +33,7 @@ using server_http_resgen_ptr = std::unique_ptr; struct server_http_request { std::map params; // path_params + query_params - json body; + std::string body; const std::function & should_stop; std::string get_param(const std::string & key, const std::string & def = "") const { diff --git a/tools/server/server.cpp b/tools/server/server.cpp index e4f0167fa838c..1f4ca79278c7d 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1665,7 +1666,7 @@ struct server_slot { server_prompt prompt; void prompt_save(server_prompt_cache & prompt_cache) const { - assert(prompt.data.size() == 0); + GGML_ASSERT(prompt.data.size() == 0); const size_t cur_size = llama_state_seq_get_size_ext(ctx, id, 0); @@ -4429,8 +4430,11 @@ struct server_resgen : server_http_resgen { }; struct server_routes { + const common_params & params; server_context & ctx_server; - server_routes(server_context & ctx_server_) : ctx_server(ctx_server_) {} + server_http_context & ctx_http; // for reading is_ready + server_routes(const common_params & params, server_context & ctx_server, server_http_context & ctx_http) + : params(params), ctx_server(ctx_server), ctx_http(ctx_http) {} public: // handlers using lambda function, so that they can capture `this` without `std::bind` @@ -4442,11 +4446,352 @@ struct server_routes { return res; }; + server_http_context::handler_t get_metrics = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_metrics) { + res->error(format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + // TODO: use server_response_reader + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names + json all_metrics_def = json { + {"counter", {{ + {"name", "prompt_tokens_total"}, + {"help", "Number of prompt tokens processed."}, + {"value", (uint64_t) res_task->n_prompt_tokens_processed_total} + }, { + {"name", "prompt_seconds_total"}, + {"help", "Prompt process time"}, + {"value", (uint64_t) res_task->t_prompt_processing_total / 1.e3} + }, { + {"name", "tokens_predicted_total"}, + {"help", "Number of generation tokens processed."}, + {"value", (uint64_t) res_task->n_tokens_predicted_total} + }, { + {"name", "tokens_predicted_seconds_total"}, + {"help", "Predict process time"}, + {"value", (uint64_t) res_task->t_tokens_generation_total / 1.e3} + }, { + {"name", "n_decode_total"}, + {"help", "Total number of llama_decode() calls"}, + {"value", res_task->n_decode_total} + }, { + {"name", "n_tokens_max"}, + {"help", "Largest observed n_tokens."}, + {"value", res_task->n_tokens_max} + }, { + {"name", "n_busy_slots_per_decode"}, + {"help", "Average number of busy slots per llama_decode() call"}, + {"value", (float) res_task->n_busy_slots_total / std::max((float) res_task->n_decode_total, 1.f)} + }}}, + {"gauge", {{ + {"name", "prompt_tokens_seconds"}, + {"help", "Average prompt throughput in tokens/s."}, + {"value", res_task->n_prompt_tokens_processed ? 1.e3 / res_task->t_prompt_processing * res_task->n_prompt_tokens_processed : 0.} + },{ + {"name", "predicted_tokens_seconds"}, + {"help", "Average generation throughput in tokens/s."}, + {"value", res_task->n_tokens_predicted ? 
1.e3 / res_task->t_tokens_generation * res_task->n_tokens_predicted : 0.} + },{ + {"name", "requests_processing"}, + {"help", "Number of requests processing."}, + {"value", (uint64_t) res_task->n_processing_slots} + },{ + {"name", "requests_deferred"}, + {"help", "Number of requests deferred."}, + {"value", (uint64_t) res_task->n_tasks_deferred} + }}} + }; + + std::stringstream prometheus; + + for (const auto & el : all_metrics_def.items()) { + const auto & type = el.key(); + const auto & metrics_def = el.value(); + + for (const auto & metric_def : metrics_def) { + const std::string name = metric_def.at("name"); + const std::string help = metric_def.at("help"); + + auto value = json_value(metric_def, "value", 0.); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; + } + } + + res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start); + res->content_type = "text/plain; version=0.0.4"; + res->ok(prometheus.str()); + return res; + }; + + server_http_context::handler_t get_slots = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_slots) { + res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // request slots data using task queue + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_METRICS); + task.id = task_id; + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task), true); // high-priority task + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + // TODO: get rid of this dynamic_cast + auto res_task = dynamic_cast(result.get()); + GGML_ASSERT(res_task != nullptr); + + // optionally return "fail_on_no_slot" error + if (!req.get_param("fail_on_no_slot").empty()) { + if (res_task->n_idle_slots == 0) { + res->error(format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); + return res; + } + } + + res->ok(res_task->slots_data); + return res; + }; + + server_http_context::handler_t post_slots = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + if (params.slot_save_path.empty()) { + res->error(format_error_response("This server does not support slots action. 
Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + std::string id_slot_str = req.get_param("id_slot"); + int id_slot; + + try { + id_slot = std::stoi(id_slot_str); + } catch (const std::exception &) { + res->error(format_error_response("Invalid slot ID", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::string action = req.get_param("action"); + + if (action == "save") { + return handle_slots_save(req, id_slot); + } else if (action == "restore") { + return handle_slots_restore(req, id_slot); + } else if (action == "erase") { + return handle_slots_erase(req, id_slot); + } else { + res->error(format_error_response("Invalid action", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + }; + + server_http_context::handler_t get_props = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + json default_generation_settings_for_props; + + { + slot_params params; + + params.sampling = ctx_server.params_base.sampling; + + default_generation_settings_for_props = json { + {"params", params.to_json(true)}, + {"n_ctx", ctx_server.slots[0].n_ctx}, + }; + } + + // this endpoint is publicly available, please only return what is safe to be exposed + json data = { + { "default_generation_settings", default_generation_settings_for_props }, + { "total_slots", ctx_server.params_base.n_parallel }, + { "model_alias", ctx_server.params_base.model_alias }, + { "model_path", ctx_server.params_base.model.path }, + { "modalities", json { + {"vision", ctx_server.oai_parser_opt.allow_image}, + {"audio", ctx_server.oai_parser_opt.allow_audio}, + } }, + { "endpoint_slots", params.endpoint_slots }, + { "endpoint_props", params.endpoint_props }, + { "endpoint_metrics", params.endpoint_metrics }, + { "webui", params.webui }, + { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, + { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, + { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, + { "build_info", build_info }, + }; + if (ctx_server.params_base.use_jinja) { + if (auto tool_use_src = common_chat_templates_source(ctx_server.chat_templates.get(), "tool_use")) { + data["chat_template_tool_use"] = tool_use_src; + } + } + + res->ok(data); + return res; + }; + + server_http_context::handler_t post_props = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + if (!params.endpoint_props) { + res->error(format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + // update any props here + + res->ok({{ "success", true }}); + return res; + }; + + server_http_context::handler_t get_api_show = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + bool has_mtmd = ctx_server.mctx != nullptr; + json data = { + { + "template", common_chat_templates_source(ctx_server.chat_templates.get()), + }, + { + "model_info", { + { "llama.context_length", ctx_server.slots.back().n_ctx, }, + } + }, + {"modelfile", ""}, + {"parameters", ""}, + {"template", common_chat_templates_source(ctx_server.chat_templates.get())}, + {"details", { + {"parent_model", ""}, + {"format", "gguf"}, + {"family", ""}, + {"families", {""}}, + {"parameter_size", ""}, + {"quantization_level", ""} + }}, + {"model_info", ""}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})} + }; + + res->ok(data); + return res; + }; + + server_http_context::handler_t post_infill = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + // check model compatibility + std::string err; + if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "prefix token is missing. "; + } + if (llama_vocab_fim_suf(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "suffix token is missing. "; + } + if (llama_vocab_fim_mid(ctx_server.vocab) == LLAMA_TOKEN_NULL) { + err += "middle token is missing. "; + } + if (!err.empty()) { + res->error(format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + // validate input + json data = json::parse(req.body); + if (data.contains("prompt") && !data.at("prompt").is_string()) { + // prompt is optional + res->error(format_error_response("\"prompt\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_prefix")) { + res->error(format_error_response("\"input_prefix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (!data.contains("input_suffix")) { + res->error(format_error_response("\"input_suffix\" is required", ERROR_TYPE_INVALID_REQUEST)); + } + + if (data.contains("input_extra") && !data.at("input_extra").is_array()) { + // input_extra is optional + res->error(format_error_response("\"input_extra\" must be an array of {\"filename\": string, \"text\": string}", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + json input_extra = json_value(data, "input_extra", json::array()); + for (const auto & chunk : input_extra) { + // { "text": string, "filename": string } + if (!chunk.contains("text") || !chunk.at("text").is_string()) { + res->error(format_error_response("extra_context chunk must contain a \"text\" field with a string value", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + // filename is optional + if (chunk.contains("filename") && !chunk.at("filename").is_string()) { + res->error(format_error_response("extra_context chunk's \"filename\" field must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + data["input_extra"] = input_extra; // default to empty array if it's not exist + + std::string prompt = json_value(data, "prompt", std::string()); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); + SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); + data["prompt"] = format_infill( + ctx_server.vocab, + data.at("input_prefix"), + data.at("input_suffix"), + data.at("input_extra"), + ctx_server.params_base.n_batch, + ctx_server.params_base.n_predict, + ctx_server.slots[0].n_ctx, // TODO: there should be a better way + ctx_server.params_base.spm_infill, + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. 
+
     server_http_context::handler_t post_completions = [this](const server_http_request & req) {
         std::vector files; // dummy
+        const json body = json::parse(req.body);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            req.body,
+            body,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_NONE);
@@ -4454,9 +4799,10 @@ struct server_routes {
 
     server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) {
         std::vector files; // dummy
+        const json body = json::parse(req.body);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            req.body,
+            body,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_COMPLETION);
@@ -4464,21 +4810,288 @@ struct server_routes {
 
     server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) {
         std::vector files;
-        json body_cpy = req.body;
-        json data = oaicompat_chat_params_parse(
-            body_cpy,
+        json body = json::parse(req.body);
+        json body_parsed = oaicompat_chat_params_parse(
+            body,
             ctx_server.oai_parser_opt,
             files);
         return handle_completions_impl(
             SERVER_TASK_TYPE_COMPLETION,
-            body_cpy,
+            body_parsed,
             files,
             req.should_stop,
             OAICOMPAT_TYPE_CHAT);
     };
 
+    // same as post_chat_completions, but without the inference part
+    server_http_context::handler_t post_apply_template = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        std::vector files; // dummy, unused
+        json body = json::parse(req.body);
+        json data = oaicompat_chat_params_parse(
+            body,
+            ctx_server.oai_parser_opt,
+            files);
+        res->ok({{ "prompt", std::move(data.at("prompt")) }});
+        return res;
+    };
+
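Editor's note: to illustrate the new endpoint, a request that exercises post_apply_template might look as follows (values are made up). It goes through the same OAI-style parsing as /v1/chat/completions, but returns the rendered prompt, e.g. `{"prompt": "<templated text>"}`, without running inference.

```cpp
// Illustrative /apply-template request body (not part of the patch).
#include <iostream>
#include <nlohmann/json.hpp>
using json = nlohmann::ordered_json;

int main() {
    json body = {
        {"messages", json::array({
            { {"role", "system"}, {"content", "You are a helpful assistant."} },
            { {"role", "user"},   {"content", "Hello!"} }
        })}
    };
    std::cout << body.dump(2) << "\n";
}
```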
+    server_http_context::handler_t get_models = [this](const server_http_request &) {
+        auto res = std::make_unique(ctx_server);
+        bool is_model_ready = ctx_http.is_ready.load();
+        json model_meta = nullptr;
+        if (is_model_ready) {
+            model_meta = ctx_server.model_meta();
+        }
+        bool has_mtmd = ctx_server.mctx != nullptr;
+        json models = {
+            {"models", {
+                {
+                    {"name", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"model", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"modified_at", ""},
+                    {"size", ""},
+                    {"digest", ""}, // dummy value, llama.cpp does not support managing model file's hash
+                    {"type", "model"},
+                    {"description", ""},
+                    {"tags", {""}},
+                    {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})},
+                    {"parameters", ""},
+                    {"details", {
+                        {"parent_model", ""},
+                        {"format", "gguf"},
+                        {"family", ""},
+                        {"families", {""}},
+                        {"parameter_size", ""},
+                        {"quantization_level", ""}
+                    }}
+                }
+            }},
+            {"object", "list"},
+            {"data", {
+                {
+                    {"id", params.model_alias.empty() ? params.model.path : params.model_alias},
+                    {"object", "model"},
+                    {"created", std::time(0)},
+                    {"owned_by", "llamacpp"},
+                    {"meta", model_meta},
+                },
+            }}
+        };
+
+        res->ok(models);
+        return res;
+    };
+
+    server_http_context::handler_t post_tokenize = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        const json body = json::parse(req.body);
+        json tokens_response = json::array();
+        if (body.count("content") != 0) {
+            const bool add_special = json_value(body, "add_special", false);
+            const bool parse_special = json_value(body, "parse_special", true);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+
+            llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = common_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
+        }
+
+        const json data = format_tokenizer_response(tokens_response);
+        res->ok(data);
+        return res;
+    };
+
+    server_http_context::handler_t post_detokenize = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        const json body = json::parse(req.body);
+
+        std::string content;
+        if (body.count("tokens") != 0) {
+            const llama_tokens tokens = body.at("tokens");
+            content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
+        }
+
+        const json data = format_detokenized_response(content);
+        res->ok(data);
+        return res;
+    };
+
+    server_http_context::handler_t post_embeddings = [this](const server_http_request & req) {
+        return handle_embeddings_impl(req, OAICOMPAT_TYPE_NONE);
+    };
+
+    server_http_context::handler_t post_embeddings_oai = [this](const server_http_request & req) {
+        return handle_embeddings_impl(req, OAICOMPAT_TYPE_EMBEDDING);
+    };
+
+    server_http_context::handler_t post_rerank = [this](const server_http_request & req) {
+        auto res = std::make_unique(ctx_server);
+        if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+            res->error(format_error_response("This server does not support reranking.
Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + const json body = json::parse(req.body); + + // if true, use TEI API format, otherwise use Jina API format + // Jina: https://jina.ai/reranker/ + // TEI: https://huggingface.github.io/text-embeddings-inference/#/Text%20Embeddings%20Inference/rerank + bool is_tei_format = body.contains("texts"); + + json query; + if (body.count("query") == 1) { + query = body.at("query"); + if (!query.is_string()) { + res->error(format_error_response("\"query\" must be a string", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } else { + res->error(format_error_response("\"query\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + std::vector documents = json_value(body, "documents", + json_value(body, "texts", std::vector())); + if (documents.empty()) { + res->error(format_error_response("\"documents\" must be a non-empty string array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int top_n = json_value(body, "top_n", (int)documents.size()); + + // create and queue the task + json responses = json::array(); + server_response_reader rd(ctx_server); + { + std::vector tasks; + tasks.reserve(documents.size()); + for (size_t i = 0; i < documents.size(); i++) { + auto tmp = format_rerank(ctx_server.model, ctx_server.vocab, ctx_server.mctx, query, documents[i]); + server_task task = server_task(SERVER_TASK_TYPE_RERANK); + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tmp); + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = format_response_rerank( + body, + responses, + is_tei_format, + documents, + top_n); + + res->ok(root); + return res; + }; + + server_http_context::handler_t get_lora_adapters = [this](const server_http_request &) { + auto res = std::make_unique(ctx_server); + json result = json::array(); + const auto & loras = ctx_server.params_base.lora_adapters; + for (size_t i = 0; i < loras.size(); ++i) { + auto & lora = loras[i]; + json entry = { + {"id", i}, + {"path", lora.path}, + {"scale", lora.scale}, + {"task_name", lora.task_name}, + {"prompt_prefix", lora.prompt_prefix}, + }; + std::string alora_invocation_string = ""; + const uint64_t n_alora_tokens = llama_adapter_get_alora_n_invocation_tokens(lora.ptr); + std::vector alora_invocation_tokens; + if (n_alora_tokens) { + const llama_token * alora_tokens = llama_adapter_get_alora_invocation_tokens(lora.ptr); + for (uint64_t i = 0; i < n_alora_tokens; ++i) { + alora_invocation_string += common_token_to_piece(ctx_server.ctx, alora_tokens[i]); + alora_invocation_tokens.push_back(alora_tokens[i]); + } + entry["alora_invocation_string"] = alora_invocation_string; + entry["alora_invocation_tokens"] = alora_invocation_tokens; + } + result.push_back(std::move(entry)); + } + res->ok(result); + return res; + }; + + server_http_context::handler_t post_lora_adapters = [this](const server_http_request & req) { + auto res = std::make_unique(ctx_server); + const json body = json::parse(req.body); + if (!body.is_array()) { + 
res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SET_LORA); + task.id = task_id; + task.set_lora = parse_lora_request(ctx_server.params_base.lora_adapters, body); + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + // get the result + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + }; + private: - server_http_resgen_ptr handle_completions_impl( + std::unique_ptr handle_completions_impl( server_task_type type, const json & data, const std::vector & files, @@ -4573,7 +5186,8 @@ struct server_routes { } // next responses are streamed - res->ok(first_result->to_json()); + res->data = format_sse(first_result->to_json()); // to be sent immediately + res->status = 200; res->content_type = "text/event-stream"; res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool { if (should_stop()) { @@ -4583,7 +5197,6 @@ struct server_routes { server_response_reader & rd = res_this->rd; std::string & output = res_this->data; - output.clear(); // receive subsequent results auto result = rd.next(should_stop); @@ -4595,7 +5208,7 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - output += format_sse(res_json); + output = format_sse(res_json); SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // terminate on error } else { @@ -4603,7 +5216,7 @@ struct server_routes { dynamic_cast(result.get()) != nullptr || dynamic_cast(result.get()) != nullptr ); - output += format_sse(res_json); + output = format_sse(res_json); } // check if there is more data @@ -4622,6 +5235,202 @@ struct server_routes { return res; } + + std::unique_ptr handle_slots_save(const server_http_request & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json & request_data = req.body; + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_SAVE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_slots_restore(const server_http_request & req, int id_slot) { + auto res = std::make_unique(ctx_server); + const json & request_data = req.body; + std::string filename = request_data.at("filename"); + if (!fs_validate_filename(filename)) { + res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); + 
return res; + } + std::string filepath = params.slot_save_path + filename; + + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_RESTORE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + task.slot_action.filename = filename; + task.slot_action.filepath = filepath; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_slots_erase(const server_http_request &, int id_slot) { + auto res = std::make_unique(ctx_server); + int task_id = ctx_server.queue_tasks.get_new_id(); + { + server_task task(SERVER_TASK_TYPE_SLOT_ERASE); + task.id = task_id; + task.slot_action.slot_id = id_slot; + + // TODO: use server_response_reader + ctx_server.queue_results.add_waiting_task_id(task_id); + ctx_server.queue_tasks.post(std::move(task)); + } + + server_task_result_ptr result = ctx_server.queue_results.recv(task_id); + ctx_server.queue_results.remove_waiting_task_id(task_id); + + if (result->is_error()) { + res->error(result->to_json()); + return res; + } + + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res->ok(result->to_json()); + return res; + } + + std::unique_ptr handle_embeddings_impl(const server_http_request & req, oaicompat_type oaicompat) { + auto res = std::make_unique(ctx_server); + if (!ctx_server.params_base.embedding) { + res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); + return res; + } + + if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res->error(format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + const json body = json::parse(req.body); + + // for the shape of input/content, see tokenize_input_prompts() + json prompt; + if (body.count("input") != 0) { + prompt = body.at("input"); + } else if (body.contains("content")) { + oaicompat = OAICOMPAT_TYPE_NONE; // "content" field is not OAI compatible + prompt = body.at("content"); + } else { + res->error(format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res->error(format_error_response("The format to return the embeddings in. 
Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res->error(format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + } + + int embd_normalize = 2; // default to Euclidean/L2 norm + if (body.count("embd_normalize") != 0) { + embd_normalize = body.at("embd_normalize"); + if (llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + SRV_DBG("embd_normalize is not supported by pooling type %d, ignoring it\n", llama_pooling_type(ctx_server.ctx)); + } + } + + // create and queue the task + json responses = json::array(); + server_response_reader rd(ctx_server); + { + std::vector tasks; + for (size_t i = 0; i < tokenized_prompts.size(); i++) { + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + + task.id = ctx_server.queue_tasks.get_new_id(); + task.index = i; + task.tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.oaicompat = oaicompat; + task.params.embd_normalize = embd_normalize; + + tasks.push_back(std::move(task)); + } + rd.post_tasks(std::move(tasks)); + } + + // wait for the results + auto all_results = rd.wait_for_all(req.should_stop); + + // collect results + if (all_results.is_terminated) { + return res; // connection is closed + } else if (all_results.error) { + res->error(all_results.error->to_json()); + return res; + } else { + for (auto & res : all_results.results) { + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); + } + } + + // write JSON response + json root = oaicompat == OAICOMPAT_TYPE_EMBEDDING + ? format_embeddings_response_oaicompat(body, responses, use_base64) + : json(responses); + res->ok(root); + return res; + } }; std::function shutdown_handler; @@ -4684,14 +5493,40 @@ int main(int argc, char ** argv) { // // register API routes - server_routes routes(ctx_server); - ctx_http.get ("/health", routes.get_health); + server_routes routes(params, ctx_server, ctx_http); + + ctx_http.get ("/health", routes.get_health); // public endpoint (no API key check) + ctx_http.get ("/v1/health", routes.get_health); // public endpoint (no API key check) + ctx_http.get ("/metrics", routes.get_metrics); + ctx_http.get ("/props", routes.get_props); + ctx_http.post("/props", routes.post_props); + ctx_http.post("/api/show", routes.get_api_show); + ctx_http.get ("/models", routes.get_models); // public endpoint (no API key check) + ctx_http.get ("/v1/models", routes.get_models); // public endpoint (no API key check) + ctx_http.get ("/api/tags", routes.get_models); // ollama specific endpoint. 
public endpoint (no API key check) ctx_http.post("/completion", routes.post_completions); // legacy ctx_http.post("/completions", routes.post_completions); ctx_http.post("/v1/completions", routes.post_completions_oai); ctx_http.post("/chat/completions", routes.post_chat_completions); ctx_http.post("/v1/chat/completions", routes.post_chat_completions); ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint + ctx_http.post("/infill", routes.post_infill); + ctx_http.post("/embedding", routes.post_embeddings); // legacy + ctx_http.post("/embeddings", routes.post_embeddings); + ctx_http.post("/v1/embeddings", routes.post_embeddings_oai); + ctx_http.post("/rerank", routes.post_rerank); + ctx_http.post("/reranking", routes.post_rerank); + ctx_http.post("/v1/rerank", routes.post_rerank); + ctx_http.post("/v1/reranking", routes.post_rerank); + ctx_http.post("/tokenize", routes.post_tokenize); + ctx_http.post("/detokenize", routes.post_detokenize); + ctx_http.post("/apply-template", routes.post_apply_template); + // LoRA adapters hotswap + ctx_http.get ("/lora-adapters", routes.get_lora_adapters); + ctx_http.post("/lora-adapters", routes.post_lora_adapters); + // Save & load slots + ctx_http.get ("/slots", routes.get_slots); + ctx_http.post("/slots/:id_slot", routes.post_slots); // // Start the server From a2e6a00371d0f860e21511f158693bf19759130d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:28:51 +0100 Subject: [PATCH 04/21] fix exception/error handling --- tools/server/server-http.cpp | 31 +++++++------ tools/server/server.cpp | 89 +++++++++++++++++++++++------------- 2 files changed, 74 insertions(+), 46 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 5b3b665f4b22f..d3fb13bb840be 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -68,6 +68,8 @@ bool server_http_context::init(const common_params & params) { srv->set_default_headers({{"Server", "llama.cpp"}}); srv->set_logger(log_server_request); srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { + // this is fail-safe; exceptions should already handled by `ex_wrapper` + std::string message; try { std::rethrow_exception(ep); @@ -77,26 +79,25 @@ bool server_http_context::init(const common_params & params) { message = "Unknown Exception"; } - // FIXME - GGML_UNUSED(res); - GGML_UNUSED(message); - printf("Exception caught in HTTP server: %s\n", message.c_str()); - // try { - // json formatted_error = format_error_response(message, ERROR_TYPE_SERVER); - // LOG_WRN("got exception: %s\n", formatted_error.dump().c_str()); - // res_error(res, formatted_error); - // } catch (const std::exception & e) { - // LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); - // } + res.status = 500; + res.set_content(message, "text/plain"); + LOG_ERR("got exception: %s\n", message.c_str()); }); srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { if (res.status == 404) { - // FIXME - //res_error(res, format_error_response("File Not Found", ERROR_TYPE_NOT_FOUND)); - res.set_content("404 Not Found", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "File Not Found"}, + {"type", "not_found_error"}, + {"code", 404} + }} + }).dump(), + "application/json; charset=utf-8" + ); } - // for other error codes, we skip processing here because it's already done by res_error() + // for other error codes, 
we skip processing here because it's already done by res->error() }); // set timeouts and change hostname and port diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1f4ca79278c7d..cee4a7fa0a96b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4425,7 +4425,7 @@ struct server_resgen : server_http_resgen { } void error(const json & error_data) { status = json_value(error_data, "code", 500); - data = safe_json_to_str(error_data); + data = safe_json_to_str({{ "error", error_data }}); } }; @@ -5447,6 +5447,33 @@ inline void signal_handler(int signal) { shutdown_handler(signal); } +// wrapper function that handles exceptions and logs errors +static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { + return [func = std::move(func)](const server_http_request & req) -> server_http_resgen_ptr { + std::string message; + try { + return func(req); + } catch (const std::exception & e) { + message = e.what(); + } catch (...) { + message = "unknown error"; + } + + auto res = std::make_unique(); + res->status = 500; + try { + json error_data = format_error_response(message, ERROR_TYPE_SERVER); + res->status = json_value(error_data, "code", 500); + res->data = safe_json_to_str({{ "error", error_data }}); + LOG_WRN("got exception: %s\n", res->data.c_str()); + } catch (const std::exception & e) { + LOG_ERR("got another exception: %s | while hanlding exception: %s\n", e.what(), message.c_str()); + res->data = "Internal Server Error"; + } + return res; + }; +} + int main(int argc, char ** argv) { // own arguments required by this example common_params params; @@ -5495,38 +5522,38 @@ int main(int argc, char ** argv) { // register API routes server_routes routes(params, ctx_server, ctx_http); - ctx_http.get ("/health", routes.get_health); // public endpoint (no API key check) - ctx_http.get ("/v1/health", routes.get_health); // public endpoint (no API key check) - ctx_http.get ("/metrics", routes.get_metrics); - ctx_http.get ("/props", routes.get_props); - ctx_http.post("/props", routes.post_props); - ctx_http.post("/api/show", routes.get_api_show); - ctx_http.get ("/models", routes.get_models); // public endpoint (no API key check) - ctx_http.get ("/v1/models", routes.get_models); // public endpoint (no API key check) - ctx_http.get ("/api/tags", routes.get_models); // ollama specific endpoint. 
public endpoint (no API key check) - ctx_http.post("/completion", routes.post_completions); // legacy - ctx_http.post("/completions", routes.post_completions); - ctx_http.post("/v1/completions", routes.post_completions_oai); - ctx_http.post("/chat/completions", routes.post_chat_completions); - ctx_http.post("/v1/chat/completions", routes.post_chat_completions); - ctx_http.post("/api/chat", routes.post_chat_completions); // ollama specific endpoint - ctx_http.post("/infill", routes.post_infill); - ctx_http.post("/embedding", routes.post_embeddings); // legacy - ctx_http.post("/embeddings", routes.post_embeddings); - ctx_http.post("/v1/embeddings", routes.post_embeddings_oai); - ctx_http.post("/rerank", routes.post_rerank); - ctx_http.post("/reranking", routes.post_rerank); - ctx_http.post("/v1/rerank", routes.post_rerank); - ctx_http.post("/v1/reranking", routes.post_rerank); - ctx_http.post("/tokenize", routes.post_tokenize); - ctx_http.post("/detokenize", routes.post_detokenize); - ctx_http.post("/apply-template", routes.post_apply_template); + ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/v1/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) + ctx_http.get ("/metrics", ex_wrapper(routes.get_metrics)); + ctx_http.get ("/props", ex_wrapper(routes.get_props)); + ctx_http.post("/props", ex_wrapper(routes.post_props)); + ctx_http.post("/api/show", ex_wrapper(routes.get_api_show)); + ctx_http.get ("/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/v1/models", ex_wrapper(routes.get_models)); // public endpoint (no API key check) + ctx_http.get ("/api/tags", ex_wrapper(routes.get_models)); // ollama specific endpoint. 
public endpoint (no API key check) + ctx_http.post("/completion", ex_wrapper(routes.post_completions)); // legacy + ctx_http.post("/completions", ex_wrapper(routes.post_completions)); + ctx_http.post("/v1/completions", ex_wrapper(routes.post_completions_oai)); + ctx_http.post("/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/v1/chat/completions", ex_wrapper(routes.post_chat_completions)); + ctx_http.post("/api/chat", ex_wrapper(routes.post_chat_completions)); // ollama specific endpoint + ctx_http.post("/infill", ex_wrapper(routes.post_infill)); + ctx_http.post("/embedding", ex_wrapper(routes.post_embeddings)); // legacy + ctx_http.post("/embeddings", ex_wrapper(routes.post_embeddings)); + ctx_http.post("/v1/embeddings", ex_wrapper(routes.post_embeddings_oai)); + ctx_http.post("/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/rerank", ex_wrapper(routes.post_rerank)); + ctx_http.post("/v1/reranking", ex_wrapper(routes.post_rerank)); + ctx_http.post("/tokenize", ex_wrapper(routes.post_tokenize)); + ctx_http.post("/detokenize", ex_wrapper(routes.post_detokenize)); + ctx_http.post("/apply-template", ex_wrapper(routes.post_apply_template)); // LoRA adapters hotswap - ctx_http.get ("/lora-adapters", routes.get_lora_adapters); - ctx_http.post("/lora-adapters", routes.post_lora_adapters); + ctx_http.get ("/lora-adapters", ex_wrapper(routes.get_lora_adapters)); + ctx_http.post("/lora-adapters", ex_wrapper(routes.post_lora_adapters)); // Save & load slots - ctx_http.get ("/slots", routes.get_slots); - ctx_http.post("/slots/:id_slot", routes.post_slots); + ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); + ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); // // Start the server From 66c6fe27446062e3d9b84783a6053e13c95af93f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:35:35 +0100 Subject: [PATCH 05/21] renaming --- tools/server/server-http.cpp | 6 +-- tools/server/server-http.h | 10 ++-- tools/server/server.cpp | 98 ++++++++++++++++++------------------ 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index d3fb13bb840be..ee25ec6c939be 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -306,7 +306,7 @@ static std::map get_params(const httplib::Request & re void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ + server_http_res_ptr response = handler(server_http_req{ get_params(req), req.body, req.is_connection_closed @@ -320,7 +320,7 @@ void server_http_context::get(const std::string & path, server_http_context::han void server_http_context::post(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { - server_http_resgen_ptr response = handler(server_http_request{ + server_http_res_ptr response = handler(server_http_req{ get_params(req), req.body, req.is_connection_closed @@ -330,7 +330,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha set_headers(res, response->headers); std::string content_type = response->content_type; // convert to shared_ptr as both chunked_content_provider() and 
on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); + std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() sink.write(response->data.data(), response->data.size()); diff --git a/tools/server/server-http.h b/tools/server/server-http.h index f449bfedc27b0..b518586047eeb 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -12,7 +12,7 @@ // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` // when next() returns false, no more data after the current chunk -struct server_http_resgen { +struct server_http_res { std::string content_type = "application/json; charset=utf-8"; int status = 200; std::string data; @@ -24,14 +24,14 @@ struct server_http_resgen { return next != nullptr; } - virtual ~server_http_resgen() = default; + virtual ~server_http_res() = default; }; // unique pointer, used by set_chunked_content_provider // we need to use unique_ptr because httplib requires the stream provider to be stored in heap -using server_http_resgen_ptr = std::unique_ptr; +using server_http_res_ptr = std::unique_ptr; -struct server_http_request { +struct server_http_req { std::map params; // path_params + query_params std::string body; const std::function & should_stop; @@ -63,7 +63,7 @@ struct server_http_context { bool start(); void stop(); - using handler_t = std::function; + using handler_t = std::function; void get(const std::string &, handler_t); void post(const std::string &, handler_t); }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index cee4a7fa0a96b..6fdead9756ae6 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4416,9 +4416,9 @@ struct server_response_reader { }; // generator-like API for HTTP response generation -struct server_resgen : server_http_resgen { +struct server_res_generator : server_http_res { server_response_reader rd; - server_resgen(server_context & ctx_server_) : rd(ctx_server_) {} + server_res_generator(server_context & ctx_server_) : rd(ctx_server_) {} void ok(const json & response_data) { status = 200; data = safe_json_to_str(response_data); @@ -4439,15 +4439,15 @@ struct server_routes { public: // handlers using lambda function, so that they can capture `this` without `std::bind` - server_http_context::handler_t get_health = [this](const server_http_request &) { + server_http_context::handler_t get_health = [this](const server_http_req &) { // error and loading states are handled by middleware - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server); res->ok({{"status", "ok"}}); return res; }; - server_http_context::handler_t get_metrics = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_metrics = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_metrics) { res->error(format_error_response("This server does not support metrics endpoint. 
Start it with `--metrics`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4549,8 +4549,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_slots = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_slots) { res->error(format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4590,8 +4590,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_slots = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_slots = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (params.slot_save_path.empty()) { res->error(format_error_response("This server does not support slots action. Start it with `--slot-save-path`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4621,8 +4621,8 @@ struct server_routes { } }; - server_http_context::handler_t get_props = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); json default_generation_settings_for_props; { @@ -4665,8 +4665,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_props = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_props = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); if (!params.endpoint_props) { res->error(format_error_response("This server does not support changing global properties. 
Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -4677,8 +4677,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_api_show = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_api_show = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); bool has_mtmd = ctx_server.mctx != nullptr; json data = { { @@ -4708,8 +4708,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_infill = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_infill = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); // check model compatibility std::string err; if (llama_vocab_fim_pre(ctx_server.vocab) == LLAMA_TOKEN_NULL) { @@ -4786,7 +4786,7 @@ struct server_routes { OAICOMPAT_TYPE_NONE); // infill is not OAI compatible }; - server_http_context::handler_t post_completions = [this](const server_http_request & req) { + server_http_context::handler_t post_completions = [this](const server_http_req & req) { std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( @@ -4797,7 +4797,7 @@ struct server_routes { OAICOMPAT_TYPE_NONE); }; - server_http_context::handler_t post_completions_oai = [this](const server_http_request & req) { + server_http_context::handler_t post_completions_oai = [this](const server_http_req & req) { std::vector files; // dummy const json body = json::parse(req.body); return handle_completions_impl( @@ -4808,7 +4808,7 @@ struct server_routes { OAICOMPAT_TYPE_COMPLETION); }; - server_http_context::handler_t post_chat_completions = [this](const server_http_request & req) { + server_http_context::handler_t post_chat_completions = [this](const server_http_req & req) { std::vector files; json body = json::parse(req.body); json body_parsed = oaicompat_chat_params_parse( @@ -4824,8 +4824,8 @@ struct server_routes { }; // same with handle_chat_completions, but without inference part - server_http_context::handler_t post_apply_template = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_apply_template = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); std::vector files; // dummy, unused json body = json::parse(req.body); json data = oaicompat_chat_params_parse( @@ -4836,8 +4836,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_models = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_models = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); bool is_model_ready = ctx_http.is_ready.load(); json model_meta = nullptr; if (is_model_ready) { @@ -4883,8 +4883,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_tokenize = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_tokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); json tokens_response = json::array(); if (body.count("content") != 0) { @@ -4925,8 +4925,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_detokenize = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + 
server_http_context::handler_t post_detokenize = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); std::string content; @@ -4940,16 +4940,16 @@ struct server_routes { return res; }; - server_http_context::handler_t post_embeddings = [this](const server_http_request & req) { + server_http_context::handler_t post_embeddings = [this](const server_http_req & req) { return handle_embeddings_impl(req, OAICOMPAT_TYPE_NONE); }; - server_http_context::handler_t post_embeddings_oai = [this](const server_http_request & req) { + server_http_context::handler_t post_embeddings_oai = [this](const server_http_req & req) { return handle_embeddings_impl(req, OAICOMPAT_TYPE_EMBEDDING); }; - server_http_context::handler_t post_rerank = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_rerank = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) { res->error(format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -5028,8 +5028,8 @@ struct server_routes { return res; }; - server_http_context::handler_t get_lora_adapters = [this](const server_http_request &) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t get_lora_adapters = [this](const server_http_req &) { + auto res = std::make_unique(ctx_server); json result = json::array(); const auto & loras = ctx_server.params_base.lora_adapters; for (size_t i = 0; i < loras.size(); ++i) { @@ -5059,8 +5059,8 @@ struct server_routes { return res; }; - server_http_context::handler_t post_lora_adapters = [this](const server_http_request & req) { - auto res = std::make_unique(ctx_server); + server_http_context::handler_t post_lora_adapters = [this](const server_http_req & req) { + auto res = std::make_unique(ctx_server); const json body = json::parse(req.body); if (!body.is_array()) { res->error(format_error_response("Request body must be an array", ERROR_TYPE_INVALID_REQUEST)); @@ -5091,7 +5091,7 @@ struct server_routes { }; private: - std::unique_ptr handle_completions_impl( + std::unique_ptr handle_completions_impl( server_task_type type, const json & data, const std::vector & files, @@ -5099,7 +5099,7 @@ struct server_routes { oaicompat_type oaicompat) { GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL); - auto res = std::make_unique(ctx_server); + auto res = std::make_unique(ctx_server); auto completion_id = gen_chatcmplid(); auto & rd = res->rd; @@ -5236,8 +5236,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_save(const server_http_request & req, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_save(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); const json & request_data = req.body; std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -5271,8 +5271,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_restore(const server_http_request & req, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot) { + auto res = std::make_unique(ctx_server); const json & request_data = req.body; std::string filename = 
request_data.at("filename"); if (!fs_validate_filename(filename)) { @@ -5307,8 +5307,8 @@ struct server_routes { return res; } - std::unique_ptr handle_slots_erase(const server_http_request &, int id_slot) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_slots_erase(const server_http_req &, int id_slot) { + auto res = std::make_unique(ctx_server); int task_id = ctx_server.queue_tasks.get_new_id(); { server_task task(SERVER_TASK_TYPE_SLOT_ERASE); @@ -5333,8 +5333,8 @@ struct server_routes { return res; } - std::unique_ptr handle_embeddings_impl(const server_http_request & req, oaicompat_type oaicompat) { - auto res = std::make_unique(ctx_server); + std::unique_ptr handle_embeddings_impl(const server_http_req & req, oaicompat_type oaicompat) { + auto res = std::make_unique(ctx_server); if (!ctx_server.params_base.embedding) { res->error(format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return res; @@ -5449,7 +5449,7 @@ inline void signal_handler(int signal) { // wrapper function that handles exceptions and logs errors static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { - return [func = std::move(func)](const server_http_request & req) -> server_http_resgen_ptr { + return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { std::string message; try { return func(req); @@ -5459,7 +5459,7 @@ static server_http_context::handler_t ex_wrapper(server_http_context::handler_t message = "unknown error"; } - auto res = std::make_unique(); + auto res = std::make_unique(); res->status = 500; try { json error_data = format_error_response(message, ERROR_TYPE_SERVER); From 92a150f9eadf856ca3333a78fefa87cd1eafb2dd Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 22:43:45 +0100 Subject: [PATCH 06/21] missing header --- tools/server/server-http.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b518586047eeb..a29c566170a36 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -6,6 +6,7 @@ #include #include #include +#include // generator-like API for HTTP response generation // this object response with one of the 2 modes: From d990534e774b7a29b190fc9e736d4f35b5265398 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 23:22:43 +0100 Subject: [PATCH 07/21] fix missing windows header --- tools/server/server.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6fdead9756ae6..527801084cb13 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,6 +2,13 @@ #include "utils.hpp" #include "server-http.h" +#ifdef _WIN32 +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#endif + #include "arg.h" #include "common.h" #include "json-schema-to-grammar.h" From f428fe5841ea4c50233290d1c94268e13fd7f17e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 12 Nov 2025 23:54:12 +0100 Subject: [PATCH 08/21] fix error responses from http layer --- tools/server/server-http.cpp | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index ee25ec6c939be..8ed66215269e1 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -147,10 +147,17 @@ bool server_http_context::init(const common_params & params) { } // API key is invalid 
or not provided - //res_error(res, format_error_response("Invalid API Key", ERROR_TYPE_AUTHENTICATION)); - // FIXME res.status = 401; - res.set_content("Unauthorized: Invalid API Key", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "Invalid API Key"}, + {"type", "authentication_error"}, + {"code", 401} + }} + }).dump(), + "application/json; charset=utf-8" + ); LOG_WRN("Unauthorized: Invalid API Key\n"); @@ -168,10 +175,17 @@ bool server_http_context::init(const common_params & params) { // allow the models endpoint to be accessed during loading return true; } else { - // FIXME - //res_error(res, format_error_response("Loading model", ERROR_TYPE_UNAVAILABLE)); res.status = 503; - res.set_content("503 Service Unavailable: Loading model", "text/plain"); + res.set_content( + (json { + {"error", { + {"message", "Loading model"}, + {"type", "unavailable_error"}, + {"code", 503} + }} + }).dump(), + "application/json; charset=utf-8" + ); } return false; } From 25cc7eb6fb3ba51674fb6cb350c3afd1ed22eaf9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 00:00:08 +0100 Subject: [PATCH 09/21] fix slot save/restore handler --- tools/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 527801084cb13..6770dff940b0b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5245,7 +5245,7 @@ struct server_routes { std::unique_ptr handle_slots_save(const server_http_req & req, int id_slot) { auto res = std::make_unique(ctx_server); - const json & request_data = req.body; + const json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); @@ -5280,7 +5280,7 @@ struct server_routes { std::unique_ptr handle_slots_restore(const server_http_req & req, int id_slot) { auto res = std::make_unique(ctx_server); - const json & request_data = req.body; + const json request_data = json::parse(req.body); std::string filename = request_data.at("filename"); if (!fs_validate_filename(filename)) { res->error(format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST)); From 3be8a3ac0c6c65e5b458d05d82c9889af603ffeb Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 09:35:14 +0100 Subject: [PATCH 10/21] fix case where only one stream chunk is returned --- tools/server/server.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 6770dff940b0b..b555329878706 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5205,6 +5205,17 @@ struct server_routes { server_response_reader & rd = res_this->rd; std::string & output = res_this->data; + // check if there is more data + if (!rd.has_next()) { + if (oaicompat != OAICOMPAT_TYPE_NONE) { + output = "data: [DONE]\n\n"; + } else { + output = ""; + } + SRV_DBG("%s", "all results received, terminating stream\n"); + return false; // no more data, terminate + } + // receive subsequent results auto result = rd.next(should_stop); if (result == nullptr) { @@ -5215,7 +5226,7 @@ struct server_routes { // send the results json res_json = result->to_json(); if (result->is_error()) { - output = format_sse(res_json); + output = format_sse(json {{ "error", res_json }}); SRV_DBG("%s", "error received during streaming, terminating stream\n"); return false; // 
terminate on error } else { @@ -5226,15 +5237,6 @@ struct server_routes { output = format_sse(res_json); } - // check if there is more data - if (!rd.has_next()) { - if (oaicompat != OAICOMPAT_TYPE_NONE) { - output += "data: [DONE]\n\n"; - } - SRV_DBG("%s", "all results received, terminating stream\n"); - return false; // no more data, terminate - } - // has next data, continue return true; }; From 9917e044f09e732a03cafeb11f713c29e7403de4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 09:37:05 +0100 Subject: [PATCH 11/21] add NOMINMAX --- tools/server/server.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index b555329878706..dfd1fbc727b50 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,9 +2,11 @@ #include "utils.hpp" #include "server-http.h" -#ifdef _WIN32 -#ifndef WIN32_LEAN_AND_MEAN +// fix problem with std::min and std::max +#if defined(_WIN32) #define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX #endif #include #endif From fc35e9165106e30a25fb00e6fb8fabb6b438097e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:08:13 +0100 Subject: [PATCH 12/21] do not call sink.write on empty data --- tools/server/server-http.cpp | 15 ++++++++++----- tools/server/server-http.h | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 8ed66215269e1..e11c5a6955aa8 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -347,12 +347,17 @@ void server_http_context::post(const std::string & path, server_http_context::ha std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); - if (!response->next()) { - // flush the remaining data + if (!response->data.empty()) { sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); + SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); + } + // get the next chunk + if (!response->next()) { + // flush the remaining data before ending the stream + if (!response->data.empty()) { + sink.write(response->data.data(), response->data.size()); + SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); + } SRV_DBG("%s", "http: stream ended\n"); sink.done(); return false; // end of stream diff --git a/tools/server/server-http.h b/tools/server/server-http.h index a29c566170a36..835a0c27098b9 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -13,6 +13,7 @@ // 1) normal response: `data` contains the full response body // 2) streaming response: each call to next() generates the next chunk, stored in `data` // when next() returns false, no more data after the current chunk +// note: some chunks can be empty, in which case no data is sent for that chunk struct server_http_res { std::string content_type = "application/json; charset=utf-8"; int status = 200; From 8c7fbec4a462cd672a0f13c5170d6dbe89fcbe47 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:23:30 +0100 Subject: [PATCH 13/21] use safe_json_to_str for SSE --- tools/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index d828c922f26c9..9ad6a070e31ad 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -460,7 +460,7 @@ static std::string format_sse(const json & data) { std::ostringstream ss; auto send_single = [&ss](const json & data) { ss << "data: " << - data.dump(-1, ' ', false, json::error_handler_t::replace) << + safe_json_to_str(data) << "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
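         // e.g. a payload {"x":1} goes on the wire as: data: {"x":1}\n\n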
}; From da458d68d9104ca5c038cd84258e9ce8b4c15de3 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 10:29:40 +0100 Subject: [PATCH 14/21] clean up --- tools/server/utils.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index 9ad6a070e31ad..bf21726051e55 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -19,9 +19,6 @@ #include #include -#define JSON_ASSERT GGML_ASSERT -#include - #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; @@ -427,6 +424,10 @@ static std::string gen_tool_call_id() { // other common utils // +static std::string safe_json_to_str(const json & data) { + return data.dump(-1, ' ', false, json::error_handler_t::replace); +} + // TODO: reuse llama_detokenize template static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) { @@ -951,10 +952,6 @@ static json format_logit_bias(const std::vector & logit_bias) return data; } -static std::string safe_json_to_str(const json & data) { - return data.dump(-1, ' ', false, json::error_handler_t::replace); -} - static std::vector get_token_probabilities(llama_context * ctx, int idx) { std::vector cur; const auto * logits = llama_get_logits_ith(ctx, idx); From cd1047000b61678086247ea3218d472eb4dde194 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 13 Nov 2025 11:36:54 +0100 Subject: [PATCH 15/21] add some comments --- tools/server/server-http.cpp | 12 ++++++------ tools/server/server-http.h | 3 ++- tools/server/server.cpp | 5 ++++- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index e11c5a6955aa8..7e49b9954a9a9 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -87,13 +87,13 @@ bool server_http_context::init(const common_params & params) { srv->set_error_handler([](const httplib::Request &, httplib::Response & res) { if (res.status == 404) { res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "File Not Found"}, {"type", "not_found_error"}, {"code", 404} }} - }).dump(), + }), "application/json; charset=utf-8" ); } @@ -149,13 +149,13 @@ bool server_http_context::init(const common_params & params) { // API key is invalid or not provided res.status = 401; res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "Invalid API Key"}, {"type", "authentication_error"}, {"code", 401} }} - }).dump(), + }), "application/json; charset=utf-8" ); @@ -177,13 +177,13 @@ bool server_http_context::init(const common_params & params) { } else { res.status = 503; res.set_content( - (json { + safe_json_to_str(json { {"error", { {"message", "Loading model"}, {"type", "unavailable_error"}, {"code", 503} }} - }).dump(), + }), "application/json; charset=utf-8" ); } diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 835a0c27098b9..b2344b0c4d53f 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -30,7 +30,7 @@ struct server_http_res { }; // unique pointer, used by set_chunked_content_provider -// we need to use unique_ptr because httplib requires the stream provider to be stored in heap +// httplib requires the stream provider to be stored in heap using server_http_res_ptr = std::unique_ptr; struct server_http_req { @@ -65,6 +65,7 @@ struct server_http_context { bool start(); void stop(); + // note: the handler should never throw exceptions using handler_t = std::function; void get(const 
std::string &, handler_t); void post(const std::string &, handler_t); diff --git a/tools/server/server.cpp b/tools/server/server.cpp index dfd1fbc727b50..b4149d093d9bb 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5459,6 +5459,7 @@ inline void signal_handler(int signal) { } // wrapper function that handles exceptions and logs errors +// this is to make sure handler_t never throws exceptions; instead, it returns an error response static server_http_context::handler_t ex_wrapper(server_http_context::handler_t func) { return [func = std::move(func)](const server_http_req & req) -> server_http_res_ptr { std::string message; @@ -5638,7 +5639,9 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.start_loop(); clean_up(); - ctx_http.thread.join(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } llama_memory_breakdown_print(ctx_server.ctx); return 0; From 1bc41f60a841d6d62d555301a054e4710c9f945a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 14 Nov 2025 15:15:38 +0100 Subject: [PATCH 16/21] improve usage of next() --- tools/server/server-http.cpp | 23 +++++++++-------------- tools/server/server-http.h | 6 +++--- tools/server/server.cpp | 27 +++++++++++++++++---------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 7e49b9954a9a9..8200dbc7a1719 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -346,23 +346,18 @@ void server_http_context::post(const std::string & path, server_http_context::ha // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it std::shared_ptr r_ptr = std::move(response); const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - // TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() - if (!response->data.empty()) { - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk: %s\n", response->data.c_str()); + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // TODO: maybe handle sink.write unsuccessful? 
for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); } - // get the next chunk - if (!response->next()) { - // flush the remaining data before ending the stream - if (!response->data.empty()) { - sink.write(response->data.data(), response->data.size()); - SRV_DBG("http: streamed chunk (last): %s\n", response->data.c_str()); - } - SRV_DBG("%s", "http: stream ended\n"); + if (!has_next) { sink.done(); - return false; // end of stream + SRV_DBG("%s", "http: stream ended\n"); } - return true; // more data, continue the loop + return has_next; }; const auto on_complete = [response = r_ptr](bool) mutable { response.reset(); // trigger the destruction of the response object diff --git a/tools/server/server-http.h b/tools/server/server-http.h index b2344b0c4d53f..71fd5c83808de 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -11,8 +11,8 @@ // generator-like API for HTTP response generation // this object response with one of the 2 modes: // 1) normal response: `data` contains the full response body -// 2) streaming response: each call to next() generates the next chunk, stored in `data` -// when next() returns false, no more data after the current chunk +// 2) streaming response: each call to next(output) generates the next chunk +// when next(output) returns false, no more data after the current chunk // note: some chunks can be empty, in which case no data is sent for that chunk struct server_http_res { std::string content_type = "application/json; charset=utf-8"; @@ -21,7 +21,7 @@ struct server_http_res { std::map headers; // TODO: move this to a virtual function once we have proper polymorphism support - std::function next = nullptr; + std::function next = nullptr; bool is_stream() const { return next != nullptr; } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 1971e9777b1a2..9d41f16aec1d2 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5202,14 +5202,20 @@ struct server_routes { res->data = format_sse(first_result->to_json()); // to be sent immediately res->status = 200; res->content_type = "text/event-stream"; - res->next = [res_this = res.get(), oaicompat, &should_stop]() mutable -> bool { + res->next = [res_this = res.get(), oaicompat, &should_stop](std::string & output) -> bool { if (should_stop()) { SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); return false; // should_stop condition met } + if (!res_this->data.empty()) { + // flush the first chunk + output = std::move(res_this->data); + res_this->data.clear(); + return true; + } + server_response_reader & rd = res_this->rd; - std::string & output = res_this->data; // check if there is more data if (!rd.has_next()) { @@ -5583,14 +5589,21 @@ int main(int argc, char ** argv) { llama_backend_free(); }; - LOG_INF("%s: HTTP server is listening, hostname: %s, port: %d, http threads: %d\n", __func__, params.hostname.c_str(), params.port, params.n_threads_http); + // start the HTTP server before loading the model to be able to serve /health requests + if (!ctx_http.start()) { + clean_up(); + LOG_ERR("%s: exiting due to HTTP server error\n", __func__); + return 1; + } // load the model LOG_INF("%s: loading model\n", __func__); if (!ctx_server.load_model(params)) { clean_up(); - ctx_http.thread.join(); + if (ctx_http.thread.joinable()) { + ctx_http.thread.join(); + } LOG_ERR("%s: exiting due to model loading error\n", __func__); return 1; } @@ -5632,12 +5645,6 @@ int 
main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif - if (!ctx_http.start()) { - clean_up(); - LOG_ERR("%s: exiting due to HTTP server error\n", __func__); - return 1; - } - LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); From 55ccf46b931088406eccd07b37647fa0e60e11f9 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 14 Nov 2025 18:17:40 +0100 Subject: [PATCH 17/21] bring back the "server is listening on" message --- tools/server/server-http.cpp | 5 ++--- tools/server/server-http.h | 3 +++ tools/server/server.cpp | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 8200dbc7a1719..9a4e3f16642d4 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -289,9 +289,8 @@ bool server_http_context::start() { thread = std::thread([this]() { pimpl->srv->listen_after_bind(); }); srv->wait_until_ready(); - LOG_INF("%s: server is listening on %s\n", __func__, - is_sock ? string_format("unix://%s", hostname.c_str()).c_str() : - string_format("http://%s:%d", hostname.c_str(), port).c_str()); + listening_address = is_sock ? string_format("unix://%s", hostname.c_str()) + : string_format("http://%s:%d", hostname.c_str(), port); return true; } diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 71fd5c83808de..39713c021b84e 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -69,4 +69,7 @@ struct server_http_context { using handler_t = std::function; void get(const std::string &, handler_t); void post(const std::string &, handler_t); + + // for debugging + std::string listening_address; }; diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 9d41f16aec1d2..9f1a788aaf466 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -5645,6 +5645,7 @@ int main(int argc, char ** argv) { SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); #endif + LOG_INF("%s: server is listening on %s\n", __func__, ctx_http.listening_address.c_str()); LOG_INF("%s: starting the main loop...\n", __func__); // this call blocks the main thread until queue_tasks.terminate() is called ctx_server.queue_tasks.start_loop(); From 4d37cee576e390597c188fab5d076cf6d30ec642 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 15 Nov 2025 21:17:36 +0100 Subject: [PATCH 18/21] more generic handler --- tools/server/server-http.cpp | 67 ++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 33 deletions(-) diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 9a4e3f16642d4..b9e4f03b28727 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -317,6 +317,38 @@ static std::map get_params(const httplib::Request & re return params; } +static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) { + if (response->is_stream()) { + res.status = response->status; + set_headers(res, response->headers); + std::string content_type = response->content_type; + // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it + std::shared_ptr r_ptr = std::move(response); + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { + std::string chunk; + bool has_next = response->next(chunk); + if (!chunk.empty()) { + // 
TODO: maybe handle sink.write unsuccessful? for now, we rely on is_connection_closed() + sink.write(chunk.data(), chunk.size()); + SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); + } + if (!has_next) { + sink.done(); + SRV_DBG("%s", "http: stream ended\n"); + } + return has_next; + }; + const auto on_complete = [response = r_ptr](bool) mutable { + response.reset(); // trigger the destruction of the response object + }; + res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); + } else { + res.status = response->status; + set_headers(res, response->headers); + res.set_content(response->data, response->content_type); + } +} + void server_http_context::get(const std::string & path, server_http_context::handler_t handler) { pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { server_http_res_ptr response = handler(server_http_req{ @@ -324,10 +356,7 @@ void server_http_context::get(const std::string & path, server_http_context::han req.body, req.is_connection_closed }); - GGML_ASSERT(!response->is_stream() && "not supported for GET method"); - res.status = response->status; - set_headers(res, response->headers); - res.set_content(response->data, response->content_type); + process_handler_response(response, res); }); } @@ -338,35 +367,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha req.body, req.is_connection_closed }); - if (response->is_stream()) { - res.status = response->status; - set_headers(res, response->headers); - std::string content_type = response->content_type; - // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { - std::string chunk; - bool has_next = response->next(chunk); - if (!chunk.empty()) { - // TODO: maybe handle sink.write unsuccessful? 
From 68d5c6f8c5faf23336c0954a17a34efcccb3f163 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 21:29:28 +0100
Subject: [PATCH 19/21] add req.headers

---
 tools/server/server-http.cpp | 10 ++++++++++
 tools/server/server-http.h   |  1 +
 2 files changed, 11 insertions(+)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index b9e4f03b28727..e2f91467faf96 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -317,6 +317,14 @@ static std::map<std::string, std::string> get_params(const httplib::Request & re
     return params;
 }
 
+static std::map<std::string, std::string> get_headers(const httplib::Request & req) {
+    std::map<std::string, std::string> headers;
+    for (const auto & [key, value] : req.headers) {
+        headers[key] = value;
+    }
+    return headers;
+}
+
 static void process_handler_response(server_http_res_ptr & response, httplib::Response & res) {
     if (response->is_stream()) {
         res.status = response->status;
@@ -353,6 +361,7 @@ void server_http_context::get(const std::string & path, server_http_context::han
     pimpl->srv->Get(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
+            get_headers(req),
             req.body,
             req.is_connection_closed
         });
@@ -364,6 +373,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha
     pimpl->srv->Post(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) {
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
+            get_headers(req),
             req.body,
             req.is_connection_closed
         });

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 39713c021b84e..2e632e666d8b0 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -35,6 +35,7 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
 
 struct server_http_req {
     std::map<std::string, std::string> params; // path_params + query_params
+    std::map<std::string, std::string> headers;
     std::string body;
     const std::function<bool()> & should_stop;
 

From 2c9fe91e23e94d5a12989e7cca4b39f68f13c36e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 21:30:43 +0100
Subject: [PATCH 20/21] move the chat template print to init()

---
 tools/server/server.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 9f1a788aaf466..1c9e9a58d7daf 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -2605,6 +2605,11 @@ struct server_context {
             /* allow_audio     */ mctx ? mtmd_support_audio(mctx) : false,
             /* enable_thinking */ enable_thinking,
         };
+
+        // print sample chat example to make it clear which template is used
+        LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
+            common_chat_templates_source(chat_templates.get()),
+            common_chat_format_example(chat_templates.get(), params_base.use_jinja, params_base.default_template_kwargs).c_str());
     }
 
     server_slot * get_slot_by_id(int id) {
@@ -5613,11 +5618,6 @@ int main(int argc, char ** argv) {
 
     LOG_INF("%s: model loaded\n", __func__);
 
-    // print sample chat example to make it clear which template is used
-    LOG_INF("%s: chat template, chat_template: %s, example_format: '%s'\n", __func__,
-        common_chat_templates_source(ctx_server.chat_templates.get()),
-        common_chat_format_example(ctx_server.chat_templates.get(), ctx_server.params_base.use_jinja, ctx_server.params_base.default_template_kwargs).c_str());
-
     ctx_server.queue_tasks.on_new_task([&ctx_server](server_task && task) {
         ctx_server.process_single_task(std::move(task));
     });
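
Note (illustrative, not part of the patch): req.headers lets a handler inspect
arbitrary request headers without reaching into httplib. A minimal sketch with
a hypothetical endpoint, assuming the same server_http_res members as above:

    // echo a request header back as plain text
    ctx_http.get("/demo/agent", [](const server_http_req & req) -> server_http_res_ptr {
        auto res = std::make_unique<server_http_res>();
        res->content_type = "text/plain; charset=utf-8";
        auto it   = req.headers.find("User-Agent");
        res->data = it != req.headers.end() ? it->second : "(no User-Agent header)";
        return res;
    });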
From 016f8b40e3fc92f10dc1305a0df74a29da0dd32d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Sat, 15 Nov 2025 22:05:40 +0100
Subject: [PATCH 21/21] add req.path

---
 tools/server/server-http.cpp | 2 ++
 tools/server/server-http.h   | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp
index e2f91467faf96..196ced443261a 100644
--- a/tools/server/server-http.cpp
+++ b/tools/server/server-http.cpp
@@ -362,6 +362,7 @@ void server_http_context::get(const std::string & path, server_http_context::han
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
             get_headers(req),
+            req.path,
             req.body,
             req.is_connection_closed
         });
@@ -374,6 +375,7 @@ void server_http_context::post(const std::string & path, server_http_context::ha
         server_http_res_ptr response = handler(server_http_req{
             get_params(req),
             get_headers(req),
+            req.path,
             req.body,
             req.is_connection_closed
         });

diff --git a/tools/server/server-http.h b/tools/server/server-http.h
index 2e632e666d8b0..dc6ca92fd8751 100644
--- a/tools/server/server-http.h
+++ b/tools/server/server-http.h
@@ -35,7 +35,8 @@ using server_http_res_ptr = std::unique_ptr<server_http_res>;
 
 struct server_http_req {
     std::map<std::string, std::string> params; // path_params + query_params
-    std::map<std::string, std::string> headers;
+    std::map<std::string, std::string> headers; // reserved for future use
+    std::string path;                           // reserved for future use
     std::string body;
    const std::function<bool()> & should_stop;
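
Note (illustrative, not part of the patch): req.path makes it possible to
register one handler under several routes and branch on the route that was
actually hit. A minimal sketch, assuming the default JSON content_type; the
response shapes are hypothetical:

    // one handler serving both the OpenAI-style and Ollama-style model lists
    auto models_handler = [](const server_http_req & req) -> server_http_res_ptr {
        auto res = std::make_unique<server_http_res>();
        res->data = req.path == "/api/tags" ? R"({"models":[]})" : R"({"data":[]})";
        return res;
    };
    ctx_http.get("/v1/models", models_handler);
    ctx_http.get("/api/tags",  models_handler);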