Server: enable lookup decoding

JohannesGaessler committed May 12, 2024
1 parent dc685be commit 71c98cc

Showing 15 changed files with 351 additions and 109 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -824,7 +824,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o ngram-cache.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

3 changes: 2 additions & 1 deletion build.zig
@@ -124,6 +124,7 @@ pub fn build(b: *std.build.Builder) !void {
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const ngram_cache = make.obj("ngram-cache", "common/ngram-cache.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
@@ -136,7 +137,7 @@ pub fn build(b: *std.build.Builder) !void {
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });

const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, ngram_cache, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
4 changes: 2 additions & 2 deletions common/common.cpp
@@ -1594,9 +1594,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" -ld LOGDIR, --logdir LOGDIR\n");
printf(" path under which to save YAML logs (no logging if unset)\n");
printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
printf(" path to static lookup cache to use for n-gram lookup decoding (not updated by generation)\n");
printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
printf(" path to dynamic lookup cache to use for n-gram lookup decoding (updated by generation)\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
27 changes: 12 additions & 15 deletions common/ngram-cache.cpp
@@ -6,19 +6,18 @@
#include <fstream>

void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
llama_token * inp_data, int inp_size, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();

const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1);
int64_t n_done = 0;

for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) {
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
const int64_t i_start = std::max((int64_t)(inp_size - nnew), ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
llama_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];
llama_ngram ngram(inp_data + ngram_start, ngram_size);
const llama_token token = inp_data[i];

llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
@@ -48,8 +47,8 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
}

// Helper function to get a token from the combined, speculative sequence of inp and draft.
static llama_token get_token(const std::vector<llama_token> & inp, const std::vector<llama_token> & draft, const size_t i) {
return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
static llama_token get_token(const llama_token * inp_data, const int inp_size, const std::vector<llama_token> & draft, const int i) {
return i < inp_size ? inp_data[i] : draft[1 + i - inp_size];
}

// If sample size or percentage are below these thresholds the draft is aborted early:
@@ -140,11 +139,10 @@ static llama_token try_draft(
}

void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_token * inp_data, int inp_size, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();

if (inp_size < LLAMA_NGRAM_STATIC) {
return;
@@ -156,7 +154,7 @@ void llama_ngram_cache_draft(
const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
llama_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
ngram_static.tokens[j-ngram_start_static] = get_token(inp_data, inp_size, draft, j);
}
llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
llama_ngram_cache_part part_static;
@@ -170,7 +168,7 @@
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
llama_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp_data, inp_size, draft, j);
}
ngrams_cd.push_back(ngram_cd);
}
@@ -216,12 +214,11 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen

}

llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
return false;
}
llama_ngram_cache ngram_cache;

llama_ngram ngram;
int32_t ntokens;
@@ -251,7 +248,7 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
}
GGML_ASSERT(hashmap_file.eof());

return ngram_cache;
return true;
}

void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
22 changes: 16 additions & 6 deletions common/ngram-cache.h
@@ -37,12 +37,21 @@ struct llama_ngram {
}
};

struct llama_token_hash_function {
size_t operator()(const llama_token token) const {
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
return token * 11400714819323198485llu;
}
};

struct llama_ngram_hash_function {
size_t operator()(const llama_ngram & ngram) const {
size_t hash = 0;
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);

for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
}

return hash;
}
};
@@ -64,7 +73,7 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
// In order to get correct results inp_data can ONLY BE APPENDED TO.
// Changes in the middle need a complete rebuild.
void llama_ngram_cache_update(
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, llama_token * inp_data, int inp_size, int nnew, bool print_progress);

// Try to draft tokens from ngram caches.
// inp: the tokens generated so far.
Expand All @@ -75,7 +84,7 @@ void llama_ngram_cache_update(
// nc_dynamic: ngram cache based on previous user generations.
// nc_static: ngram cache generated from a large text corpus, used for validation.
void llama_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_token * inp_data, int inp_size, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);

// Save an ngram cache to a file.
@@ -84,9 +93,10 @@ void llama_ngram_cache_draft(
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);

// Load an ngram cache saved with llama_ngram_cache_save.
// ngram_cache: the ngram cache to load the data into.
// filename: the path from which to load the ngram cache.
// returns: an ngram cache containing the information saved to filename.
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
bool llama_ngram_cache_load(llama_ngram_cache & ngram_cache, std::string & filename);

// Merge two ngram caches.
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
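
For reference, a minimal caller sketch of the load API shown above: `llama_ngram_cache_load` now fills a caller-provided cache and returns whether the file could be read, instead of throwing. The helper name and error handling below are illustrative assumptions, not code from this repository.

``` cpp
#include <cstdio>
#include <string>

#include "ngram-cache.h"  // common/ngram-cache.h from this repository

// Simplified load-or-start-empty helper; real callers (see lookup-stats.cpp
// below) decide per cache type whether a missing file is an error.
static llama_ngram_cache load_or_empty(std::string path) {
    llama_ngram_cache nc;
    if (!llama_ngram_cache_load(nc, path)) {
        // File missing or unreadable: start with an empty cache; it can be
        // written back to `path` later with llama_ngram_cache_save().
        fprintf(stderr, "no cache at %s, starting empty\n", path.c_str());
    }
    return nc;
}
```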
81 changes: 75 additions & 6 deletions examples/lookup/README.md
@@ -1,13 +1,82 @@
# llama.cpp/examples/lookup

Demonstration of Prompt Lookup Decoding
Demonstration of speculative decoding using n-gram lookup.
Initial version was based on https://github.com/apoorvumang/prompt-lookup-decoding .
The current version uses three separate types of "n-gram caches".
Each of these caches maps how frequently a given n-gram is followed by a specific token.
The difference between the caches lies in what data is used to build them:

https://github.com/apoorvumang/prompt-lookup-decoding
* The "context" cache is built using the tokens in the current context of a user generation.
* The "dynamic" cache is built by merging the context caches of previous user generations.
* The "static" cache is built from a large text corpus with no relation to the current context.

The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft`. The first two determine the size of the ngrams to search for in the prompt for a match. The latter specifies how many subsequent tokens to draft if a match is found.
The tradeoff between these caches lies in relevance to the current context vs. the amount of input data.
When trying to draft a new token using n-gram lookup, the algorithm is as follows:

More info:
* Try to draft a suitable token from the context cache. If a static cache is available, use it to validate the draft candidates. This is done by simply multiplying the frequencies of the two caches.
* Try to draft a suitable token from the dynamic cache, validate with static cache if available.
* Try to draft a suitable token from the static cache.

https://github.com/ggerganov/llama.cpp/pull/4484
https://github.com/ggerganov/llama.cpp/issues/4226
Only a single token sequence with the most likely token candidates is drafted.
All tokens must pass thresholds for frequency and sample size in order to be drafted.
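
As an illustration only, here is a simplified sketch of how a single draft token could be selected from one cache and cross-checked against the static cache by multiplying frequencies. The threshold values are placeholders, not the ones used in `common/ngram-cache.cpp`.

``` cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

using token     = int32_t;
using ngram     = std::vector<token>;
using part      = std::map<token, int32_t>;  // continuation token -> frequency
using ngram_map = std::map<ngram, part>;

// Pick the continuation of the last n-gram of `inp` with the best combined
// score, weighting counts from `primary` (context or dynamic cache) by the
// counts stored in `stat` (static cache) for its own, fixed n-gram size.
// Returns -1 if nothing passes the placeholder thresholds.
static token draft_one(const ngram_map & primary, const ngram_map & stat,
                       const std::vector<token> & inp, int n, int n_static) {
    if ((int) inp.size() < std::max(n, n_static)) {
        return -1;
    }
    const ngram key   (inp.end() - n,        inp.end());
    const ngram key_st(inp.end() - n_static, inp.end());

    const auto it = primary.find(key);
    if (it == primary.end()) {
        return -1;
    }
    const auto it_st = stat.find(key_st);

    int64_t sample_size = 0;
    int64_t best_score  = 0;
    token   best_token  = -1;
    for (const auto & [tok, count] : it->second) {
        sample_size += count;

        int64_t weight = 1;  // without a static cache entry, fall back to raw counts
        if (it_st != stat.end()) {
            const auto jt = it_st->second.find(tok);
            weight = jt != it_st->second.end() ? jt->second : 0;
        }

        const int64_t score = (int64_t) count * weight;
        if (score > best_score) {
            best_score = score;
            best_token = tok;
        }
    }

    // Placeholder thresholds: require a minimum number of observations and a
    // minimum combined score before accepting the draft.
    if (sample_size < 4 || best_score < 2) {
        return -1;
    }
    return best_token;
}
```

Drafting a longer sequence then amounts to appending the accepted token and drafting again, falling back from the context cache to the dynamic cache and finally to the static cache whenever no candidate passes.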

Relevant command line arguments:

- `--draft`: maximum number of additional tokens to draft using n-gram lookup. Default: 5. Set to 0 to disable n-gram lookup. **Results are not deterministic with n-gram lookup enabled due to varying batch size.**
- `-lcs FNAME, --lookup-cache-static FNAME`: optional path to static lookup cache to use for n-gram lookup. Created from a large, unspecific text corpus using `lookup-create`.
- `-lcd FNAME, --lookup-cache-dynamic FNAME`: optional path to dynamic lookup cache to use for n-gram lookup. Contains data from previous generations. Automatically created and filled while the server is running but by default discarded on server exit. Setting this argument tries to initialize the dynamic cache from a file and saves it to said file on server shutdown.

N-gram lookup caches saved to disk are compatible between models as long as they use the same tokenizer
(but for dynamic caches the resulting drafted tokens may be wrong, which means there is no speedup).
Furthermore, the data format for both types of caches is the same, so they can be used interchangeably (but probably not with good results).

## Usage Examples

### `lookup`

Generation using n-gram lookup:

``` sh
./lookup --model models/opt/llama_2-7b-q4_0.gguf -ngl 99 --n-predict 256 --ignore-eos --draft 3 --color --prompt "Write a love story about two stars that tragically ends in a type Ia supernova. Use a lot of emotional and dramatic language."
```

The `--color` flag highlights the successfully predicted tokens.
The `--lookup-cache-static` and `--lookup-cache-dynamic` arguments can be set to provide static/dynamic caches.

### `lookup-stats`

Determine n-gram lookup effectiveness for a given text corpus (similar to `perplexity`):

``` sh
./lookup-stats --model /opt/models/llama_2-7b-q4_0.gguf --file wikitext-2-raw/wiki.test.raw --draft 3
```

The `--lookup-cache-static` and `--lookup-cache-dynamic` arguments can be set to provide static/dynamic caches.

### `lookup-create`

Create a static lookup cache from a text corpus:

``` sh
./lookup-create --model /opt/models/llama_2-7b-q4_0.gguf --lookup-cache-static wt103-llama_2.lcs --file wikitext-103-raw/wiki.train.raw
```

The `--lookup-cache-static` argument must be set to provide the path to which the static lookup cache will be saved.
The tokenizer for which to create the cache is taken from the provided model.

### `lookup-merge`

Merge two lookup caches into one:

``` sh
./lookup-merge cache_1.lcs cache_2.lcs cache_merged.lcs
```

Can be used for both static and dynamic lookup caches.

## More info:

* https://github.com/ggerganov/llama.cpp/pull/4484
* https://github.com/ggerganov/llama.cpp/issues/4226
* https://github.com/ggerganov/llama.cpp/pull/5479
* https://github.com/ggerganov/llama.cpp/pull/6828
2 changes: 1 addition & 1 deletion examples/lookup/lookup-create.cpp
@@ -34,7 +34,7 @@ int main(int argc, char ** argv){


llama_ngram_cache ngram_cache;
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp.data(), inp.size(), inp.size(), true);
fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
6 changes: 4 additions & 2 deletions examples/lookup/lookup-merge.cpp
@@ -33,11 +33,13 @@ int main(int argc, char ** argv){
}

fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
llama_ngram_cache ngram_cache_merged;
GGML_ASSERT(llama_ngram_cache_load(ngram_cache_merged, args[0]));

for (size_t i = 1; i < args.size()-1; ++i) {
fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
llama_ngram_cache ngram_cache;
GGML_ASSERT(llama_ngram_cache_load(ngram_cache, args[i]));

llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
}
20 changes: 10 additions & 10 deletions examples/lookup/lookup-stats.cpp
@@ -30,7 +30,6 @@ int main(int argc, char ** argv){

// load the model
std::tie(model, ctx) = llama_init_from_gpt_params(params);
GGML_ASSERT(llama_n_vocab(model) < (1 << 16));

// tokenize the prompt
std::vector<llama_token> inp;
@@ -46,18 +45,15 @@ int main(int argc, char ** argv){
const int64_t t_start_draft_us = ggml_time_us();

if (!params.lookup_cache_static.empty()) {
try {
ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
} catch (std::ifstream::failure const &) {
if(!llama_ngram_cache_load(ngram_cache_static, params.lookup_cache_static)) {
fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
exit(1);
}
}

if (!params.lookup_cache_dynamic.empty()) {
try {
ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
} catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
// If the dynamic lookup cache doesn't exist it will be created at the end of the program:
llama_ngram_cache_load(ngram_cache_dynamic, params.lookup_cache_dynamic);
}

t_draft_flat_us += ggml_time_us() - t_start_draft_us;
@@ -85,7 +81,9 @@ int main(int argc, char ** argv){

{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
llama_ngram_cache_draft(
pseudo_output.data(), pseudo_output.size(), draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX,
ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
t_draft_us += ggml_time_us() - t_start_draft_us;
}

@@ -104,7 +102,8 @@ int main(int argc, char ** argv){

{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
llama_ngram_cache_update(
ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output.data(), pseudo_output.size(), 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}
@@ -114,7 +113,8 @@ int main(int argc, char ** argv){
pseudo_output.push_back(inp_slice[pseudo_output.size()]);
{
const int64_t t_start_draft_us = ggml_time_us();
llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
llama_ngram_cache_update(
ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output.data(), pseudo_output.size(), 1, false);
t_draft_us += ggml_time_us() - t_start_draft_us;
}
}