From 9d7d5f21f897e801bde35f437ceab504ca65e08b Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 12:42:47 +0100
Subject: [PATCH 1/5] cli : auto activate conversation mode if chat template is
 detected

---
 common/arg.cpp         | 20 ++++++++++++--------
 common/common.h        |  9 ++++++++-
 examples/main/main.cpp | 37 +++++++++++++++++++++++++++----------
 3 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 27886b84e862c..017501310df93 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -768,15 +768,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-cnv", "--conversation"},
-        string_format(
-            "run in conversation mode:\n"
-            "- does not print special tokens and suffix/prefix\n"
-            "- interactive mode is also enabled\n"
-            "(default: %s)",
-            params.conversation ? "true" : "false"
-        ),
+        "run in conversation mode:\n"
+        "- does not print special tokens and suffix/prefix\n"
+        "- interactive mode is also enabled\n"
+        "(default: auto enabled if chat template is available)",
+        [](common_params & params) {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    add_opt(common_arg(
+        {"-no-cnv", "--no-conversation"},
+        "force disable conversation mode (default: false)",
         [](common_params & params) {
-            params.conversation = true;
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN}));
     add_opt(common_arg(
diff --git a/common/common.h b/common/common.h
index d523948b03e30..322c03a22e748 100644
--- a/common/common.h
+++ b/common/common.h
@@ -103,6 +103,12 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
+enum common_conversation_mode {
+    COMMON_CONVERSATION_MODE_DISABLED = 0,
+    COMMON_CONVERSATION_MODE_ENABLED  = 1,
+    COMMON_CONVERSATION_MODE_AUTO     = 2,
+};
+
 // sampling parameters
 struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
@@ -275,7 +281,6 @@ struct common_params {
     bool special           = false; // enable special token output
     bool interactive       = false; // interactive mode
     bool interactive_first = false; // wait for user input immediately
-    bool conversation      = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool prompt_cache_all  = false; // save user input and generations to prompt cache
     bool prompt_cache_ro   = false; // open the prompt cache read-only and do not update it
 
@@ -301,6 +306,8 @@ struct common_params {
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
+    common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
+
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
     std::vector<std::string> image; // path to image file(s)
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 640b35c1d73dd..7148e3fc356ac 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -30,6 +30,8 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+static const char * DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant";
+
 static llama_context           ** g_ctx;
 static llama_model             ** g_model;
 static common_sampler          ** g_smpl;
@@ -204,8 +206,17 @@ int main(int argc, char ** argv) {
         LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n", __func__, n_ctx_train, n_ctx);
     }
 
+    // auto enable conversation mode if chat template is available
+    if (
+        params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO
+        && (!common_get_builtin_chat_template(model).empty() || !params.chat_template.empty())
+    ) {
+        LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
+        params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+    }
+
     // print chat template example in conversation mode
-    if (params.conversation) {
+    if (params.conversation_mode) {
         if (params.enable_chat_template) {
             LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
         } else {
@@ -252,8 +263,10 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;
 
     {
-        auto prompt = (params.conversation && params.enable_chat_template && !params.prompt.empty())
-            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
+        auto prompt = (params.conversation_mode && params.enable_chat_template)
+            // format the system prompt in conversation mode (fallback to default if empty)
+            ? chat_add_and_format(model, chat_msgs, "system", params.prompt.empty() ? DEFAULT_SYSTEM_MESSAGE : params.prompt)
+            // otherwise use the prompt as is
             : params.prompt;
         if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
             LOG_DBG("tokenize the prompt\n");
@@ -327,7 +340,7 @@ int main(int argc, char ** argv) {
         params.n_keep += add_bos; // always keep the BOS token
     }
 
-    if (params.conversation) {
+    if (params.conversation_mode) {
         params.interactive_first = true;
     }
 
@@ -451,7 +464,11 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
         LOG_INF(       " - Press Ctrl+C to interject at any time.\n");
 #endif
-        LOG_INF(       "%s\n", control_message);
+        LOG_INF(       "%s", control_message);
+        if (params.conversation_mode && params.enable_chat_template && params.prompt.empty()) {
+            LOG_INF(   " - Using default system message. To change it, set a different value via -p PROMPT or -f FILE argument.\n");
+        }
+        LOG_INF("\n");
 
         is_interacting = params.interactive_first;
     }
@@ -763,7 +780,7 @@ int main(int argc, char ** argv) {
             }
 
             // if current token is not EOG, we add it to current assistant message
-            if (params.conversation) {
+            if (params.conversation_mode) {
                 const auto id = common_sampler_last(smpl);
                 assistant_ss << common_token_to_piece(ctx, id, false);
             }
@@ -771,7 +788,7 @@ int main(int argc, char ** argv) {
             if (n_past > 0 && is_interacting) {
                 LOG_DBG("waiting for user input\n");
 
-                if (params.conversation) {
+                if (params.conversation_mode) {
                     LOG("\n> ");
                 }
 
@@ -781,7 +798,7 @@ int main(int argc, char ** argv) {
                 }
 
                 std::string buffer;
-                if (!params.input_prefix.empty() && !params.conversation) {
+                if (!params.input_prefix.empty() && !params.conversation_mode) {
                     LOG_DBG("appending input prefix: '%s'\n", params.input_prefix.c_str());
                     LOG("%s", params.input_prefix.c_str());
                 }
@@ -805,7 +822,7 @@ int main(int argc, char ** argv) {
                 // Entering a empty line lets the user pass control back
                 if (buffer.length() > 1) {
                     // append input suffix if any
-                    if (!params.input_suffix.empty() && !params.conversation) {
+                    if (!params.input_suffix.empty() && !params.conversation_mode) {
                         LOG_DBG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                         LOG("%s", params.input_suffix.c_str());
                     }
@@ -818,7 +835,7 @@ int main(int argc, char ** argv) {
                         string_process_escapes(buffer);
                     }
 
-                    bool format_chat = params.conversation && params.enable_chat_template;
+                    bool format_chat = params.conversation_mode && params.enable_chat_template;
                     std::string user_inp = format_chat
                         ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                         : std::move(buffer);

From 5b1f71086589e2e1c3d18e2c0bf0faf09974c52d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 13:54:53 +0100
Subject: [PATCH 2/5] add warn on bad template

---
 examples/main/main.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 7148e3fc356ac..fb5d6bcb1fe32 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -207,14 +207,17 @@ int main(int argc, char ** argv) {
     }
 
     // auto enable conversation mode if chat template is available
-    if (
-        params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO
-        && (!common_get_builtin_chat_template(model).empty() || !params.chat_template.empty())
-    ) {
+    const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
+    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO && has_chat_template) {
         LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
         params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
     }
 
+    // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning
+    if (params.conversation_mode && !has_chat_template) {
+        LOG_WRN("%s: chat template is not available or is not supported. This may cause the model to output suboptimal responses\n", __func__);
+    }
+
     // print chat template example in conversation mode
     if (params.conversation_mode) {
         if (params.enable_chat_template) {

From 723d77cebaf0406edf254823f52c0171b31047a6 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 16:37:53 +0100
Subject: [PATCH 3/5] update readme (writing with the help of chatgpt)

---
 README.md | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 6302ac977aa94..a095c28ee32e5 100644
--- a/README.md
+++ b/README.md
@@ -263,21 +263,12 @@ To learn more about model quantization, [read this documentation](examples/quant
 #### A CLI tool for accessing and experimenting with most of `llama.cpp`'s functionality.
 
 - <details open>
-    <summary>Run simple text completion</summary>
-
-    ```bash
-    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128
-
-    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
-    ```
-
-    </details>
-
-- <details>
     <summary>Run in conversation mode</summary>
 
+    Models with a built-in chat template will automatically activate conversation mode. If this doesn't occur, you can manually enable it by adding `-cnv` and specifying a suitable chat template with `--chat-template NAME`
+
     ```bash
-    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv
+    llama-cli -m model.gguf
 
     # > hi, who are you?
     # Hi there! I'm your helpful assistant! I'm an AI-powered chatbot designed to assist and provide information to users like you. I'm here to help answer your questions, provide guidance, and offer support on a wide range of topics. I'm a friendly and knowledgeable AI, and I'm always happy to help with anything you need. What's on your mind, and how can I assist you today?
@@ -289,17 +280,28 @@ To learn more about model quantization, [read this documentation](examples/quant
     </details>
 
 - <details>
-    <summary>Run with custom chat template</summary>
+    <summary>Run in conversation mode with custom chat template</summary>
 
     ```bash
-    # use the "chatml" template
-    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --chat-template chatml
+    # use the "chatml" template (use -h to see the list of supported templates)
+    llama-cli -m model.gguf -cnv --chat-template chatml
 
     # use a custom template
-    llama-cli -m model.gguf -p "You are a helpful assistant" -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
+    llama-cli -m model.gguf -cnv --in-prefix 'User: ' --reverse-prompt 'User:'
     ```
 
-    [Supported templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+    </details>
+
+- <details>
+    <summary>Run simple text completion</summary>
+
+    To disable conversation mode explicitly, use `-no-cnv`
+
+    ```bash
+    llama-cli -m model.gguf -p "I believe the meaning of life is" -n 128 -no-cnv
+
+    # I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don't align with societal expectations. I think that's what I love about yoga – it's not just a physical practice, but a spiritual one too. It's about connecting with yourself, listening to your inner voice, and honoring your own unique journey.
+    ```
 
     </details>
 

From dd221bdd6b0bef0a0c266e1b75be236f8c89456d Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 16:45:23 +0100
Subject: [PATCH 4/5] update readme (2)

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index a095c28ee32e5..413a164229d91 100644
--- a/README.md
+++ b/README.md
@@ -245,6 +245,8 @@ The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](htt
 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)
 
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from Hugging Face by using this CLI argument: `-hf <user>/<model>[:quant]`
+
 After downloading a model, use the CLI tools to run it locally - see below.
 
 `llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.

From 73c4e77de89d6084de66e3f57ae1bf1749d16f74 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <son@huggingface.co>
Date: Mon, 13 Jan 2025 19:40:07 +0100
Subject: [PATCH 5/5] do not activate -cnv for non-instruct models

---
 examples/main/main.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index fb5d6bcb1fe32..39666a0e8a83a 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -208,9 +208,13 @@ int main(int argc, char ** argv) {
 
     // auto enable conversation mode if chat template is available
     const bool has_chat_template = !common_get_builtin_chat_template(model).empty() || !params.chat_template.empty();
-    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO && has_chat_template) {
-        LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
-        params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+    if (params.conversation_mode == COMMON_CONVERSATION_MODE_AUTO) {
+        if (has_chat_template) {
+            LOG_INF("%s: chat template is available, enabling conversation mode (disable it with -no-cnv)\n", __func__);
+            params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
+        } else {
+            params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+        }
     }
 
     // in case user force-activate conversation mode (via -cnv) without proper chat template, we show a warning