From 803031665afc6d0a7d7391693f045c4d0051eb95 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 11 Jan 2025 19:44:12 +0100
Subject: [PATCH 1/8] common : support tag-based hf_repo like on ollama

---
 common/arg.cpp    | 115 ++++++++++++++++++++++++++++++++++++++++++----
 common/common.cpp |   9 ++--
 common/common.h   |   5 ++
 3 files changed, 114 insertions(+), 15 deletions(-)
diff --git a/common/arg.cpp b/common/arg.cpp
index 27886b84e862c..112a0dc3ed3ed 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -13,6 +13,12 @@
 #include <thread>
 #include <vector>
 
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
 #include "json-schema-to-grammar.h"
 
 using json = nlohmann::ordered_json;
@@ -128,18 +134,105 @@ std::string common_arg::to_string() {
 // utils
 //
 
+#if defined(LLAMA_USE_CURL)
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * Tag is optional, default to Q4_K_M if it exists
+ * Return pair of <repo, file> (with "repo" already having tag removed)
+ */
+static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts[1] : "latest"; // "latest" means checking Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<repo>[:tag]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    std::unique_ptr<struct curl_slist, decltype(&curl_slist_free_all)> http_headers(nullptr, &curl_slist_free_all);
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str()));
+        // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+        http_headers.reset(curl_slist_append(http_headers.get(), "User-Agent: llama-cpp"));
+        http_headers.reset(curl_slist_append(http_headers.get(), "Accept: application/json"));
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get());
+    }
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to Hugging Face Hub API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error: cannot get model info from Hugging Face Hub API, response code: %ld", res_code));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    // TODO handle error
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+#else
+static std::string common_get_hf_file(const std::string &, const std::string &) {
+    throw std::runtime_error("error: llama.cpp built without libcurl");
+}
+#endif
+
 static void common_params_handle_model_default(
         std::string & model,
-        std::string & model_url,
+        const std::string & model_url,
         std::string & hf_repo,
-        std::string & hf_file) {
+        std::string & hf_file,
+        const std::string & hf_token) {
     if (!hf_repo.empty()) {
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (hf_file.empty()) {
             if (model.empty()) {
-                throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
+                try {
+                    auto auto_detected = common_get_hf_file(hf_repo, hf_token);
+                    hf_repo = auto_detected.first;
+                    hf_file = auto_detected.second;
+                    printf("%s: using hf_file = %s\n", __func__, hf_file.c_str());
+                } catch (std::exception & e) {
+                    fprintf(stderr, "%s: %s\n", __func__, e.what());
+                    exit(1);
+                }
+            } else {
+                hf_file = model;
             }
-            hf_file = model;
         } else if (model.empty()) {
             // this is to avoid different repo having same file name, or same file name in different subdirs
             std::string filename = hf_repo + "_" + hf_file;
@@ -290,8 +383,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
     }
 
     // TODO: refactor model params in a common struct
-    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file);
-    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file);
+    common_params_handle_model_default(params.model,         params.model_url,         params.hf_repo,         params.hf_file,         params.hf_token);
+    common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token);
 
     if (params.escape) {
         string_process_escapes(params.prompt);
@@ -1583,21 +1676,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
-        {"-hfr", "--hf-repo"}, "REPO",
-        "Hugging Face model repository (default: unused)",
+        {"-hf", "-hfr", "--hf-repo"}, "<repo>/<user>[:quant]",
+        "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
+        "example: unsloth/phi-4-GGUF:q4_k_m\n"
+        "(default: unused)",
         [](common_params & params, const std::string & value) {
             params.hf_repo = value;
         }
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
-        "Hugging Face model file (default: unused)",
+        "Hugging Face model file, unused if quant is already specified in --hf-repo (default: unused)",
         [](common_params & params, const std::string & value) {
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
-        {"-hfrv", "--hf-repo-v"}, "REPO",
+        {"-hfv", "-hfrv", "--hf-repo-v"}, "<repo>/<user>[:quant]",
         "Hugging Face model repository for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
             params.vocoder.hf_repo = value;
diff --git a/common/common.cpp b/common/common.cpp
index 86e4e1e24edf9..dca7ddf69e0a5 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1127,6 +1127,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
     std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
+    std::unique_ptr<struct curl_slist, decltype(&curl_slist_free_all)> http_headers(nullptr, &curl_slist_free_all);
     if (!curl) {
         LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
@@ -1140,11 +1141,9 @@ static bool common_download_file(const std::string & url, const std::string & pa
 
     // Check if hf-token or bearer-token was specified
     if (!hf_token.empty()) {
-      std::string auth_header = "Authorization: Bearer ";
-      auth_header += hf_token.c_str();
-      struct curl_slist *http_headers = NULL;
-      http_headers = curl_slist_append(http_headers, auth_header.c_str());
-      curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers);
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str()));
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get());
     }
 
 #if defined(_WIN32)
diff --git a/common/common.h b/common/common.h
index 0d452cf0f5ec3..42d75ef4bf7b8 100644
--- a/common/common.h
+++ b/common/common.h
@@ -454,6 +454,11 @@ static bool string_starts_with(const std::string & str,
     return str.rfind(prefix, 0) == 0;
 }
 
+static bool string_ends_with(const std::string & str,
+                               const std::string & suffix) {  // While we wait for C++20's std::string::ends_with...
+    return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0;
+}
+
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 

From ef089ca1052247b6c6ed8ed1dc6b199e201a54ed Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 11 Jan 2025 20:35:10 +0100
Subject: [PATCH 2/8] fix build: return a pair from the non-curl common_get_hf_file stub

---
 common/arg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 112a0dc3ed3ed..dcf89489e29c0 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -206,7 +206,7 @@ static std::pair<std::string, std::string> common_get_hf_file(const std::string
     return std::make_pair(hf_repo, gguf_file.at("rfilename"));
 }
 #else
-static std::string common_get_hf_file(const std::string &, const std::string &) {
+static std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
     throw std::runtime_error("error: llama.cpp built without libcurl");
 }
 #endif

From 242135eca42d7437ff200570cca9c07d46575012 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Sat, 11 Jan 2025 21:35:10 +0100
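Subject: [PATCH 3/8] various fixes

- add the missing `else` before the 401 check, so that a 200 response no
  longer falls through to the generic error branch
- always send the User-Agent/Accept headers, not only when an HF token is set
- replace the unique_ptr used for curl_slist with a curl_slist_ptr wrapper
- take the tag from the last ':'-separated part of the repo string
- fix the help placeholders to <user>/<model>[:quant]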

---
 common/arg.cpp    | 40 +++++++++++++++++-----------------------
 common/common.cpp | 13 ++++---------
 common/common.h   | 24 ++++++++++++++++++++++++
 3 files changed, 45 insertions(+), 32 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index dcf89489e29c0..4a9d8ecd08c8b 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -13,12 +13,6 @@
 #include <thread>
 #include <vector>
 
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
-
 #include "json-schema-to-grammar.h"
 
 using json = nlohmann::ordered_json;
@@ -140,21 +134,21 @@ std::string common_arg::to_string() {
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to Q4_K_M if it exists
+ * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
  * Return pair of <repo, file> (with "repo" already having tag removed)
  */
 static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts[1] : "latest"; // "latest" means checking Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
     if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<repo>[:tag]\n");
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
     }
 
     // fetch model info from Hugging Face Hub API
     json model_info;
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
-    std::unique_ptr<struct curl_slist, decltype(&curl_slist_free_all)> http_headers(nullptr, &curl_slist_free_all);
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
     std::string res_str;
     std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
     curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
@@ -171,26 +165,27 @@ static std::pair<std::string, std::string> common_get_hf_file(const std::string
 #endif
     if (!hf_token.empty()) {
         std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str()));
-        // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-        http_headers.reset(curl_slist_append(http_headers.get(), "User-Agent: llama-cpp"));
-        http_headers.reset(curl_slist_append(http_headers.get(), "Accept: application/json"));
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get());
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
     }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
     CURLcode res = curl_easy_perform(curl.get());
 
     if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to Hugging Face Hub API");
+        throw std::runtime_error("error: cannot make GET request to HF API");
     }
 
     long res_code;
     curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
     if (res_code == 200) {
         model_info = json::parse(res_str);
-    } if (res_code == 401) {
+    } else if (res_code == 401) {
         throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
     } else {
-        throw std::runtime_error(string_format("error: cannot get model info from Hugging Face Hub API, response code: %ld", res_code));
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
     }
 
     // check response
@@ -202,7 +197,6 @@ static std::pair<std::string, std::string> common_get_hf_file(const std::string
         throw std::runtime_error("error: ggufFile does not have rfilename");
     }
 
-    // TODO handle error
     return std::make_pair(hf_repo, gguf_file.at("rfilename"));
 }
 #else
@@ -1676,7 +1670,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
     add_opt(common_arg(
-        {"-hf", "-hfr", "--hf-repo"}, "<repo>/<user>[:quant]",
+        {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"
         "example: unsloth/phi-4-GGUF:q4_k_m\n"
         "(default: unused)",
@@ -1686,13 +1680,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_HF_REPO"));
     add_opt(common_arg(
         {"-hff", "--hf-file"}, "FILE",
-        "Hugging Face model file, unused if quant is already specified in --hf-repo (default: unused)",
+        "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)",
         [](common_params & params, const std::string & value) {
             params.hf_file = value;
         }
     ).set_env("LLAMA_ARG_HF_FILE"));
     add_opt(common_arg(
-        {"-hfv", "-hfrv", "--hf-repo-v"}, "<repo>/<user>[:quant]",
+        {"-hfv", "-hfrv", "--hf-repo-v"}, "<user>/<model>[:quant]",
         "Hugging Face model repository for the vocoder model (default: unused)",
         [](common_params & params, const std::string & value) {
             params.vocoder.hf_repo = value;
diff --git a/common/common.cpp b/common/common.cpp
index dca7ddf69e0a5..b25ef2c5ed095 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -52,11 +52,6 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -1126,8 +1121,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma
 
 static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
     // Initialize libcurl
-    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
-    std::unique_ptr<struct curl_slist, decltype(&curl_slist_free_all)> http_headers(nullptr, &curl_slist_free_all);
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
     if (!curl) {
         LOG_ERR("%s: error initializing libcurl\n", __func__);
         return false;
@@ -1142,8 +1137,8 @@ static bool common_download_file(const std::string & url, const std::string & pa
     // Check if hf-token or bearer-token was specified
     if (!hf_token.empty()) {
         std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str()));
-        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get());
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+        curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
     }
 
 #if defined(_WIN32)
diff --git a/common/common.h b/common/common.h
index 42d75ef4bf7b8..a2c97cd51fe01 100644
--- a/common/common.h
+++ b/common/common.h
@@ -8,6 +8,12 @@
 #include <vector>
 #include <sstream>
 
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
+
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
@@ -651,4 +657,22 @@ const char * const LLM_KV_SPLIT_NO            = "split.no";
 const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
+#if defined(LLAMA_USE_CURL)
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
+#endif
+
 }

From d7b5bf8e9435b98c41c03873f29ce04c8877ded4 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 11:44:38 +0100
Subject: [PATCH 4/8] small fixes: use LOG_INF, derive default model path after hf_file auto-detection

---
 common/arg.cpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 4a9d8ecd08c8b..74803c6a9af8d 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -135,7 +135,10 @@ std::string common_arg::to_string() {
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
  * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
+ * 
  * Return pair of <repo, file> (with "repo" already having tag removed)
+ * 
+ * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
 static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
@@ -219,7 +222,7 @@ static void common_params_handle_model_default(
                     auto auto_detected = common_get_hf_file(hf_repo, hf_token);
                     hf_repo = auto_detected.first;
                     hf_file = auto_detected.second;
-                    printf("%s: using hf_file = %s\n", __func__, hf_file.c_str());
+                    LOG_INF("%s: using hf_file = %s\n", __func__, hf_file.c_str());
                 } catch (std::exception & e) {
                     fprintf(stderr, "%s: %s\n", __func__, e.what());
                     exit(1);
@@ -227,7 +230,9 @@ static void common_params_handle_model_default(
             } else {
                 hf_file = model;
             }
-        } else if (model.empty()) {
+        }
+        // make sure model path is present (for caching purposes)
+        if (model.empty()) {
             // this is to avoid different repo having same file name, or same file name in different subdirs
             std::string filename = hf_repo + "_" + hf_file;
             // to make sure we don't have any slashes in the filename

From ff484f77e3bba67239aa41c579c32588cbded88f Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 11:46:42 +0100
Subject: [PATCH 5/8] fix style: strip trailing whitespace in doc comment

---
 common/arg.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 74803c6a9af8d..56d46ea2cf60e 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -135,9 +135,9 @@ std::string common_arg::to_string() {
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
  * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
  * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- * 
+ *
  * Return pair of <repo, file> (with "repo" already having tag removed)
- * 
+ *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
 static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {

From 6ffb590e15a4852d42dda5d29af7be083f4854f8 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 12:03:19 +0100
Subject: [PATCH 6/8] fix windows build? (define NOMINMAX in arg.cpp)

---
 common/arg.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/common/arg.cpp b/common/arg.cpp
index 56d46ea2cf60e..d8075171b4d72 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3,6 +3,10 @@
 #include "log.h"
 #include "sampling.h"
 
+#ifdef _WIN32
+#define NOMINMAX
+#endif
+
 #include <algorithm>
 #include <climits>
 #include <cstdarg>

From 22927b1c0a76ef4cb34cb80432cd2e5352a77b4e Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 12:08:18 +0100
Subject: [PATCH 7/8] move common_get_hf_file to common.cpp

---
 common/arg.cpp    | 84 ----------------------------------------
 common/common.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++++
 common/common.h   | 27 ++-----------
 3 files changed, 102 insertions(+), 108 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index d8075171b4d72..8c7dcc75151b3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -3,10 +3,6 @@
 #include "log.h"
 #include "sampling.h"
 
-#ifdef _WIN32
-#define NOMINMAX
-#endif
-
 #include <algorithm>
 #include <climits>
 #include <cstdarg>
@@ -132,86 +128,6 @@ std::string common_arg::to_string() {
 // utils
 //
 
-#if defined(LLAMA_USE_CURL)
-/**
- * Allow getting the HF file from the HF repo with tag (like ollama), for example:
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
- * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
- * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo)
- *
- * Return pair of <repo, file> (with "repo" already having tag removed)
- *
- * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
- */
-static std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
-    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
-    std::string tag = parts.size() > 1 ? parts.back() : "latest";
-    std::string hf_repo = parts[0];
-    if (string_split<std::string>(hf_repo, '/').size() != 2) {
-        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
-    }
-
-    // fetch model info from Hugging Face Hub API
-    json model_info;
-    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
-    curl_slist_ptr http_headers;
-    std::string res_str;
-    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
-    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
-    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
-    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
-    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
-        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
-        return size * nmemb;
-    };
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
-    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
-#if defined(_WIN32)
-    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
-#endif
-    if (!hf_token.empty()) {
-        std::string auth_header = "Authorization: Bearer " + hf_token;
-        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
-    }
-    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
-    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
-    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
-
-    CURLcode res = curl_easy_perform(curl.get());
-
-    if (res != CURLE_OK) {
-        throw std::runtime_error("error: cannot make GET request to HF API");
-    }
-
-    long res_code;
-    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
-    if (res_code == 200) {
-        model_info = json::parse(res_str);
-    } else if (res_code == 401) {
-        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
-    } else {
-        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
-    }
-
-    // check response
-    if (!model_info.contains("ggufFile")) {
-        throw std::runtime_error("error: model does not have ggufFile");
-    }
-    json & gguf_file = model_info.at("ggufFile");
-    if (!gguf_file.contains("rfilename")) {
-        throw std::runtime_error("error: ggufFile does not have rfilename");
-    }
-
-    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
-}
-#else
-static std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    throw std::runtime_error("error: llama.cpp built without libcurl");
-}
-#endif
-
 static void common_params_handle_model_default(
         std::string & model,
         const std::string & model_url,
diff --git a/common/common.cpp b/common/common.cpp
index e7860faeec143..c0a52d8d195a8 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -52,6 +52,11 @@
 #include <sys/stat.h>
 #include <unistd.h>
 #endif
+#if defined(LLAMA_USE_CURL)
+#include <curl/curl.h>
+#include <curl/easy.h>
+#include <future>
+#endif
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -68,6 +73,22 @@
 #include <sys/syslimits.h>
 #endif
 #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083
+
+//
+// CURL utils
+//
+
+using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
+
+// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
+struct curl_slist_ptr {
+    struct curl_slist * ptr = nullptr;
+    ~curl_slist_ptr() {
+        if (ptr) {
+            curl_slist_free_all(ptr);
+        }
+    }
+};
 #endif // LLAMA_USE_CURL
 
 using json = nlohmann::ordered_json;
@@ -1438,6 +1459,80 @@ struct llama_model * common_load_model_from_hf(
     return common_load_model_from_url(model_url, local_path, hf_token, params);
 }
 
+/**
+ * Allow getting the HF file from the HF repo with tag (like ollama), for example:
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M
+ * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s
+ * The tag is optional and defaults to "latest" (meaning Q4_K_M is checked first, then Q4, and if neither is found, the first GGUF file in the repo is returned)
+ *
+ * Returns a pair of <repo, file>, where "repo" has the tag already removed
+ *
+ * Note: we use the Ollama-compatible HF API, but we do not use the blobId. Instead, we read the special "ggufFile" field, whose "rfilename" entry provides the value for "hf_file". This keeps us backward-compatible with existing cache files.
+ */
+std::pair<std::string, std::string> common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) {
+    auto parts = string_split<std::string>(hf_repo_with_tag, ':');
+    std::string tag = parts.size() > 1 ? parts.back() : "latest";
+    std::string hf_repo = parts[0];
+    if (string_split<std::string>(hf_repo, '/').size() != 2) {
+        throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
+    }
+
+    // fetch model info from Hugging Face Hub API
+    json model_info;
+    curl_ptr       curl(curl_easy_init(), &curl_easy_cleanup);
+    curl_slist_ptr http_headers;
+    std::string res_str;
+    std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag;
+    curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str());
+    curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L);
+    typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data);
+    auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t {
+        static_cast<std::string *>(data)->append((char * ) ptr, size * nmemb);
+        return size * nmemb;
+    };
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast<CURLOPT_WRITEFUNCTION_PTR>(write_callback));
+    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str);
+#if defined(_WIN32)
+    curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
+#endif
+    if (!hf_token.empty()) {
+        std::string auth_header = "Authorization: Bearer " + hf_token;
+        http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str());
+    }
+    // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp");
+    http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json");
+    curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr);
+
+    CURLcode res = curl_easy_perform(curl.get());
+
+    if (res != CURLE_OK) {
+        throw std::runtime_error("error: cannot make GET request to HF API");
+    }
+
+    long res_code;
+    curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code);
+    if (res_code == 200) {
+        model_info = json::parse(res_str);
+    } else if (res_code == 401) {
+        throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token");
+    } else {
+        throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str()));
+    }
+
+    // check response
+    if (!model_info.contains("ggufFile")) {
+        throw std::runtime_error("error: model does not have ggufFile");
+    }
+    json & gguf_file = model_info.at("ggufFile");
+    if (!gguf_file.contains("rfilename")) {
+        throw std::runtime_error("error: ggufFile does not have rfilename");
+    }
+
+    return std::make_pair(hf_repo, gguf_file.at("rfilename"));
+}
+
 #else
 
 struct llama_model * common_load_model_from_url(
@@ -1459,6 +1554,10 @@ struct llama_model * common_load_model_from_hf(
     return nullptr;
 }
 
+std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
+    throw std::runtime_error("error: llama.cpp built without libcurl, downloading from Hugging Face not supported.");
+}
+
 #endif // LLAMA_USE_CURL
 
 //
diff --git a/common/common.h b/common/common.h
index 3d0da54d7dd84..c86a4ef39212b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -8,12 +8,6 @@
 #include <vector>
 #include <sstream>
 
-#if defined(LLAMA_USE_CURL)
-#include <curl/curl.h>
-#include <curl/easy.h>
-#include <future>
-#endif
-
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
 #else
@@ -512,6 +506,9 @@ struct llama_model * common_load_model_from_hf(
     const std::string & local_path,
     const std::string & hf_token,
     const struct llama_model_params & params);
+std::pair<std::string, std::string> common_get_hf_file(
+    const std::string & hf_repo_with_tag,
+    const std::string & hf_token);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
@@ -667,22 +664,4 @@ const char * const LLM_KV_SPLIT_NO            = "split.no";
 const char * const LLM_KV_SPLIT_COUNT         = "split.count";
 const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
 
-#if defined(LLAMA_USE_CURL)
-//
-// CURL utils
-//
-
-using curl_ptr = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one
-struct curl_slist_ptr {
-    struct curl_slist * ptr = nullptr;
-    ~curl_slist_ptr() {
-        if (ptr) {
-            curl_slist_free_all(ptr);
-        }
-    }
-};
-#endif
-
 }

From 8bd5b18ce1d6ed3b2a15615b59c88acc95e1ad4c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 12:45:32 +0100
Subject: [PATCH 8/8] fix complain with noreturn

---
 common/arg.cpp    | 13 +++++--------
 common/common.cpp |  3 ++-
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8c7dcc75151b3..1457a360faab2 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -138,15 +138,12 @@ static void common_params_handle_model_default(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (hf_file.empty()) {
             if (model.empty()) {
-                try {
-                    auto auto_detected = common_get_hf_file(hf_repo, hf_token);
-                    hf_repo = auto_detected.first;
-                    hf_file = auto_detected.second;
-                    LOG_INF("%s: using hf_file = %s\n", __func__, hf_file.c_str());
-                } catch (std::exception & e) {
-                    fprintf(stderr, "%s: %s\n", __func__, e.what());
-                    exit(1);
+                auto auto_detected = common_get_hf_file(hf_repo, hf_token);
+                if (auto_detected.first.empty() || auto_detected.second.empty()) {
+                    exit(1); // built without CURL, error message already printed
                 }
+                hf_repo = auto_detected.first;
+                hf_file = auto_detected.second;
             } else {
                 hf_file = model;
             }
diff --git a/common/common.cpp b/common/common.cpp
index c0a52d8d195a8..a6f9252b27a9f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1555,7 +1555,8 @@ struct llama_model * common_load_model_from_hf(
 }
 
 std::pair<std::string, std::string> common_get_hf_file(const std::string &, const std::string &) {
-    throw std::runtime_error("error: llama.cpp built without libcurl, downloading from Hugging Face not supported.");
+    LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return std::make_pair("", "");
 }
 
 #endif // LLAMA_USE_CURL