From 803031665afc6d0a7d7391693f045c4d0051eb95 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 11 Jan 2025 19:44:12 +0100 Subject: [PATCH 1/8] common : support tag-based hf_repo like on ollama --- common/arg.cpp | 115 ++++++++++++++++++++++++++++++++++++++++++---- common/common.cpp | 9 ++-- common/common.h | 5 ++ 3 files changed, 114 insertions(+), 15 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 27886b84e862c..112a0dc3ed3ed 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -13,6 +13,12 @@ #include #include +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif + #include "json-schema-to-grammar.h" using json = nlohmann::ordered_json; @@ -128,18 +134,105 @@ std::string common_arg::to_string() { // utils // +#if defined(LLAMA_USE_CURL) +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to Q4_K_M if it exists + * Return pair of (with "repo" already having tag removed) + */ +static std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts[1] : "latest"; // "latest" means checking Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:tag]\n"); + } + + // fetch model info from Hugging Face Hub API + json model_info; + std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); + std::unique_ptr http_headers(nullptr, &curl_slist_free_all); + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!hf_token.empty()) { + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str())); + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.reset(curl_slist_append(http_headers.get(), "User-Agent: llama-cpp")); + http_headers.reset(curl_slist_append(http_headers.get(), "Accept: application/json")); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get()); + } + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to Hugging Face Hub API"); + } + + long res_code; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + model_info = json::parse(res_str); + } if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error: cannot get model info from Hugging Face Hub API, response code: %ld", res_code)); + } + + // check response + if (!model_info.contains("ggufFile")) { + throw std::runtime_error("error: model does not have ggufFile"); + } + json & gguf_file = model_info.at("ggufFile"); + if (!gguf_file.contains("rfilename")) { + throw std::runtime_error("error: ggufFile does not have rfilename"); + } + + // TODO handle error + return std::make_pair(hf_repo, gguf_file.at("rfilename")); +} +#else +static std::string common_get_hf_file(const std::string &, const std::string &) { + throw std::runtime_error("error: llama.cpp built without libcurl"); +} +#endif + static void common_params_handle_model_default( std::string & model, - std::string & model_url, + const std::string & model_url, std::string & hf_repo, - std::string & hf_file) { + std::string & hf_file, + const std::string & hf_token) { if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model if (hf_file.empty()) { if (model.empty()) { - throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + try { + auto auto_detected = common_get_hf_file(hf_repo, hf_token); + hf_repo = auto_detected.first; + hf_file = auto_detected.second; + printf("%s: using hf_file = %s\n", __func__, hf_file.c_str()); + } catch (std::exception & e) { + fprintf(stderr, "%s: %s\n", __func__, e.what()); + exit(1); + } + } else { + hf_file = model; } - hf_file = model; } else if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs std::string filename = hf_repo + "_" + hf_file; @@ -290,8 +383,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context } // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); + common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token); if (params.escape) { string_process_escapes(params.prompt); @@ -1583,21 +1676,23 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( - {"-hfr", "--hf-repo"}, "REPO", - "Hugging Face model repository (default: unused)", + {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", + "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "example: unsloth/phi-4-GGUF:q4_k_m\n" + "(default: unused)", [](common_params & params, const std::string & value) { params.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file (default: unused)", + "Hugging Face model file, unused if quant is already specified in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( - {"-hfrv", "--hf-repo-v"}, "REPO", + {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { params.vocoder.hf_repo = value; diff --git a/common/common.cpp b/common/common.cpp index 86e4e1e24edf9..dca7ddf69e0a5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1127,6 +1127,7 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); + std::unique_ptr http_headers(nullptr, &curl_slist_free_all); if (!curl) { LOG_ERR("%s: error initializing libcurl\n", __func__); return false; @@ -1140,11 +1141,9 @@ static bool common_download_file(const std::string & url, const std::string & pa // Check if hf-token or bearer-token was specified if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer "; - auth_header += hf_token.c_str(); - struct curl_slist *http_headers = NULL; - http_headers = curl_slist_append(http_headers, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers); + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str())); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get()); } #if defined(_WIN32) diff --git a/common/common.h b/common/common.h index 0d452cf0f5ec3..42d75ef4bf7b8 100644 --- a/common/common.h +++ b/common/common.h @@ -454,6 +454,11 @@ static bool string_starts_with(const std::string & str, return str.rfind(prefix, 0) == 0; } +static bool string_ends_with(const std::string & str, + const std::string & suffix) { // While we wait for C++20's std::string::ends_with... + return str.size() >= suffix.size() && str.compare(str.size()-suffix.size(), suffix.size(), suffix) == 0; +} + bool string_parse_kv_override(const char * data, std::vector & overrides); void string_process_escapes(std::string & input); From ef089ca1052247b6c6ed8ed1dc6b199e201a54ed Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 11 Jan 2025 20:35:10 +0100 Subject: [PATCH 2/8] fix build --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index 112a0dc3ed3ed..dcf89489e29c0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -206,7 +206,7 @@ static std::pair common_get_hf_file(const std::string return std::make_pair(hf_repo, gguf_file.at("rfilename")); } #else -static std::string common_get_hf_file(const std::string &, const std::string &) { +static std::pair common_get_hf_file(const std::string &, const std::string &) { throw std::runtime_error("error: llama.cpp built without libcurl"); } #endif From 242135eca42d7437ff200570cca9c07d46575012 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 11 Jan 2025 21:35:10 +0100 Subject: [PATCH 3/8] various fixes --- common/arg.cpp | 40 +++++++++++++++++----------------------- common/common.cpp | 13 ++++--------- common/common.h | 24 ++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 32 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index dcf89489e29c0..4a9d8ecd08c8b 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -13,12 +13,6 @@ #include #include -#if defined(LLAMA_USE_CURL) -#include -#include -#include -#endif - #include "json-schema-to-grammar.h" using json = nlohmann::ordered_json; @@ -140,21 +134,21 @@ std::string common_arg::to_string() { * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to Q4_K_M if it exists + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) * Return pair of (with "repo" already having tag removed) */ static std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts[1] : "latest"; // "latest" means checking Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo + std::string tag = parts.size() > 1 ? parts.back() : "latest"; std::string hf_repo = parts[0]; if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:tag]\n"); + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); } // fetch model info from Hugging Face Hub API json model_info; - std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); - std::unique_ptr http_headers(nullptr, &curl_slist_free_all); + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; std::string res_str; std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); @@ -171,26 +165,27 @@ static std::pair common_get_hf_file(const std::string #endif if (!hf_token.empty()) { std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str())); - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - http_headers.reset(curl_slist_append(http_headers.get(), "User-Agent: llama-cpp")); - http_headers.reset(curl_slist_append(http_headers.get(), "Accept: application/json")); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get()); + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + CURLcode res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to Hugging Face Hub API"); + throw std::runtime_error("error: cannot make GET request to HF API"); } long res_code; curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); if (res_code == 200) { model_info = json::parse(res_str); - } if (res_code == 401) { + } else if (res_code == 401) { throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); } else { - throw std::runtime_error(string_format("error: cannot get model info from Hugging Face Hub API, response code: %ld", res_code)); + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); } // check response @@ -202,7 +197,6 @@ static std::pair common_get_hf_file(const std::string throw std::runtime_error("error: ggufFile does not have rfilename"); } - // TODO handle error return std::make_pair(hf_repo, gguf_file.at("rfilename")); } #else @@ -1676,7 +1670,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( - {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", + {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", @@ -1686,13 +1680,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", - "Hugging Face model file, unused if quant is already specified in --hf-repo (default: unused)", + "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( - {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", + {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { params.vocoder.hf_repo = value; diff --git a/common/common.cpp b/common/common.cpp index dca7ddf69e0a5..b25ef2c5ed095 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -52,11 +52,6 @@ #include #include #endif -#if defined(LLAMA_USE_CURL) -#include -#include -#include -#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -1126,8 +1121,8 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int ma static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { // Initialize libcurl - std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); - std::unique_ptr http_headers(nullptr, &curl_slist_free_all); + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; if (!curl) { LOG_ERR("%s: error initializing libcurl\n", __func__); return false; @@ -1142,8 +1137,8 @@ static bool common_download_file(const std::string & url, const std::string & pa // Check if hf-token or bearer-token was specified if (!hf_token.empty()) { std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.reset(curl_slist_append(http_headers.get(), auth_header.c_str())); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.get()); + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); } #if defined(_WIN32) diff --git a/common/common.h b/common/common.h index 42d75ef4bf7b8..a2c97cd51fe01 100644 --- a/common/common.h +++ b/common/common.h @@ -8,6 +8,12 @@ #include #include +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif + #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' #else @@ -651,4 +657,22 @@ const char * const LLM_KV_SPLIT_NO = "split.no"; const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; +#if defined(LLAMA_USE_CURL) +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; +#endif + } From d7b5bf8e9435b98c41c03873f29ce04c8877ded4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 13 Jan 2025 11:44:38 +0100 Subject: [PATCH 4/8] small fixes --- common/arg.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 4a9d8ecd08c8b..74803c6a9af8d 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -135,7 +135,10 @@ std::string common_arg::to_string() { * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. */ static std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { auto parts = string_split(hf_repo_with_tag, ':'); @@ -219,7 +222,7 @@ static void common_params_handle_model_default( auto auto_detected = common_get_hf_file(hf_repo, hf_token); hf_repo = auto_detected.first; hf_file = auto_detected.second; - printf("%s: using hf_file = %s\n", __func__, hf_file.c_str()); + LOG_INF("%s: using hf_file = %s\n", __func__, hf_file.c_str()); } catch (std::exception & e) { fprintf(stderr, "%s: %s\n", __func__, e.what()); exit(1); @@ -227,7 +230,9 @@ static void common_params_handle_model_default( } else { hf_file = model; } - } else if (model.empty()) { + } + // make sure model path is present (for caching purposes) + if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename From ff484f77e3bba67239aa41c579c32588cbded88f Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 13 Jan 2025 11:46:42 +0100 Subject: [PATCH 5/8] fix style --- common/arg.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 74803c6a9af8d..56d46ea2cf60e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -135,9 +135,9 @@ std::string common_arg::to_string() { * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * + * * Return pair of (with "repo" already having tag removed) - * + * * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. */ static std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { From 6ffb590e15a4852d42dda5d29af7be083f4854f8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 13 Jan 2025 12:03:19 +0100 Subject: [PATCH 6/8] fix windows build? --- common/arg.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 56d46ea2cf60e..d8075171b4d72 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3,6 +3,10 @@ #include "log.h" #include "sampling.h" +#ifdef _WIN32 +#define NOMINMAX +#endif + #include #include #include From 22927b1c0a76ef4cb34cb80432cd2e5352a77b4e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 13 Jan 2025 12:08:18 +0100 Subject: [PATCH 7/8] move common_get_hf_file to common.cpp --- common/arg.cpp | 84 ---------------------------------------- common/common.cpp | 99 +++++++++++++++++++++++++++++++++++++++++++++++ common/common.h | 27 ++----------- 3 files changed, 102 insertions(+), 108 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index d8075171b4d72..8c7dcc75151b3 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -3,10 +3,6 @@ #include "log.h" #include "sampling.h" -#ifdef _WIN32 -#define NOMINMAX -#endif - #include #include #include @@ -132,86 +128,6 @@ std::string common_arg::to_string() { // utils // -#if defined(LLAMA_USE_CURL) -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -static std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API - json model_info; - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::string res_str; - std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast(data)->append((char * ) ptr, size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - if (res_code == 200) { - model_info = json::parse(res_str); - } else if (res_code == 401) { - throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); - } else { - throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); - } - - // check response - if (!model_info.contains("ggufFile")) { - throw std::runtime_error("error: model does not have ggufFile"); - } - json & gguf_file = model_info.at("ggufFile"); - if (!gguf_file.contains("rfilename")) { - throw std::runtime_error("error: ggufFile does not have rfilename"); - } - - return std::make_pair(hf_repo, gguf_file.at("rfilename")); -} -#else -static std::pair common_get_hf_file(const std::string &, const std::string &) { - throw std::runtime_error("error: llama.cpp built without libcurl"); -} -#endif - static void common_params_handle_model_default( std::string & model, const std::string & model_url, diff --git a/common/common.cpp b/common/common.cpp index e7860faeec143..c0a52d8d195a8 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -52,6 +52,11 @@ #include #include #endif +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -68,6 +73,22 @@ #include #endif #define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils +// + +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; #endif // LLAMA_USE_CURL using json = nlohmann::ordered_json; @@ -1438,6 +1459,80 @@ struct llama_model * common_load_model_from_hf( return common_load_model_from_url(model_url, local_path, hf_token, params); } +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + // fetch model info from Hugging Face Hub API + json model_info; + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!hf_token.empty()) { + std::string auth_header = "Authorization: Bearer " + hf_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to HF API"); + } + + long res_code; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + model_info = json::parse(res_str); + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (!model_info.contains("ggufFile")) { + throw std::runtime_error("error: model does not have ggufFile"); + } + json & gguf_file = model_info.at("ggufFile"); + if (!gguf_file.contains("rfilename")) { + throw std::runtime_error("error: ggufFile does not have rfilename"); + } + + return std::make_pair(hf_repo, gguf_file.at("rfilename")); +} + #else struct llama_model * common_load_model_from_url( @@ -1459,6 +1554,10 @@ struct llama_model * common_load_model_from_hf( return nullptr; } +std::pair common_get_hf_file(const std::string &, const std::string &) { + throw std::runtime_error("error: llama.cpp built without libcurl, downloading from Hugging Face not supported."); +} + #endif // LLAMA_USE_CURL // diff --git a/common/common.h b/common/common.h index 3d0da54d7dd84..c86a4ef39212b 100644 --- a/common/common.h +++ b/common/common.h @@ -8,12 +8,6 @@ #include #include -#if defined(LLAMA_USE_CURL) -#include -#include -#include -#endif - #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' #else @@ -512,6 +506,9 @@ struct llama_model * common_load_model_from_hf( const std::string & local_path, const std::string & hf_token, const struct llama_model_params & params); +std::pair common_get_hf_file( + const std::string & hf_repo_with_tag, + const std::string & hf_token); // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); @@ -667,22 +664,4 @@ const char * const LLM_KV_SPLIT_NO = "split.no"; const char * const LLM_KV_SPLIT_COUNT = "split.count"; const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; -#if defined(LLAMA_USE_CURL) -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; -#endif - } From 8bd5b18ce1d6ed3b2a15615b59c88acc95e1ad4c Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Mon, 13 Jan 2025 12:45:32 +0100 Subject: [PATCH 8/8] fix complain with noreturn --- common/arg.cpp | 13 +++++-------- common/common.cpp | 3 ++- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8c7dcc75151b3..1457a360faab2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -138,15 +138,12 @@ static void common_params_handle_model_default( // short-hand to avoid specifying --hf-file -> default it to --model if (hf_file.empty()) { if (model.empty()) { - try { - auto auto_detected = common_get_hf_file(hf_repo, hf_token); - hf_repo = auto_detected.first; - hf_file = auto_detected.second; - LOG_INF("%s: using hf_file = %s\n", __func__, hf_file.c_str()); - } catch (std::exception & e) { - fprintf(stderr, "%s: %s\n", __func__, e.what()); - exit(1); + auto auto_detected = common_get_hf_file(hf_repo, hf_token); + if (auto_detected.first.empty() || auto_detected.second.empty()) { + exit(1); // built without CURL, error message already printed } + hf_repo = auto_detected.first; + hf_file = auto_detected.second; } else { hf_file = model; } diff --git a/common/common.cpp b/common/common.cpp index c0a52d8d195a8..a6f9252b27a9f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1555,7 +1555,8 @@ struct llama_model * common_load_model_from_hf( } std::pair common_get_hf_file(const std::string &, const std::string &) { - throw std::runtime_error("error: llama.cpp built without libcurl, downloading from Hugging Face not supported."); + LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + return std::make_pair("", ""); } #endif // LLAMA_USE_CURL