From ece0b0d855eb5c53ec9838f6a57b6afc6c6ba573 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 6 Jan 2024 00:58:22 +0100
Subject: [PATCH] improve graph splitting, partial fix for --no-kv-offload

---
 ggml-backend.c | 132 +++++++++++++++++++++++++++++++++++++++++++------
 ggml-cuda.cu   |  25 +---------
 llama.cpp      |  11 ++++-
 3 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 8e791827255e0..31516b4c55efe 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -737,9 +737,16 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
+// TODO: group all the hash values into a single struct for clarity
+//struct sched_hash_value {
+//    ggml_tallocr_t tallocr;
+//    struct ggml_tensor * copies[GGML_MAX_BACKENDS];
+//};
+
 struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
@@ -747,11 +754,15 @@ struct ggml_backend_sched {
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
 
@@ -928,6 +939,12 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 // TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -977,17 +994,32 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
         }
     }
-    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
+
+    // expand gpu backends (ie non last prio) up and down, ignoring cpu
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
     {
         ggml_tallocr_t cur_allocr = NULL;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
             ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
             } else {
                 node_allocr(node) = cur_allocr;
                 SET_CAUSE(node, "2.cur");
@@ -995,12 +1027,58 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         }
     }
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (ggml_is_view_op(node->op) && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1008,11 +1086,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-                node_allocr(src) = node_allocr;
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                } else {
+                    node_allocr(src) = cur_allocr;
+                }
             }
         }
     }
-    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1074,7 +1159,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                         sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
                     }
 
-                    // create copies
+                    // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
                     if (sched->node_copies[id][cur_backend_id] == NULL) {
                         struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1090,8 +1175,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
     }
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
@@ -1101,6 +1187,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1112,8 +1203,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                     j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1127,6 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
            input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
@@ -1163,19 +1261,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
             if (input->buffer == NULL) {
+                GGML_ASSERT(false);
                 if (input->view_src == NULL) {
                     fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
-                    exit(1);
+                    GGML_ASSERT(false);
                 }
                 // FIXME: may need to use the sched buffer instead
                 ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
+                GGML_ASSERT(false);
             }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple time
             ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
@@ -1301,6 +1400,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 }
 
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index a17eeb2936c52..52bb8902c7a6f 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9712,6 +9712,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
 
+#ifndef NDEBUG
         assert(node->backend == GGML_BACKEND_GPU);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
@@ -9723,35 +9724,13 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
                 assert(node->src[j]->extra != nullptr);
             }
         }
+#endif
 
         bool ok = ggml_cuda_compute_forward(&params, node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
     }
 
     UNUSED(backend);
diff --git a/llama.cpp b/llama.cpp
index d8fe1554f6f2d..5104350ca88d6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1730,7 +1730,6 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        // FIXME: buffer type name
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
@@ -2463,9 +2462,9 @@ struct llama_model_loader {
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             if (!cur) {
+                // some tensors may be allocated in a different context
                 continue;
             }
-            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -3734,6 +3733,8 @@ static bool llm_load_tensors(
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
+        // indicate that this buffer contains weights
+        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight
         ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         model.bufs.push_back(buf);
         ctx_bufs.emplace_back(ctx, buf);
@@ -4336,6 +4337,12 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
+                // these nodes are added to the graph together so that they are not reordered
+                // by doing so, the number of splits in the graph is reduced
+                ggml_build_forward_expand(gf, Qcur);
+                ggml_build_forward_expand(gf, Kcur);
+                ggml_build_forward_expand(gf, Vcur);
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
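
Below is a minimal, standalone sketch of the pass 2.1/2.2 rule described in the scheduler comments above: assignments to non-last-priority (GPU) backends are expanded over unassigned ops in both directions, while the last-priority (CPU) backend is never expanded. All names and the toy node list are hypothetical and not part of ggml; this only illustrates the assignment rule, assuming backend priorities run from 0 to n_backends - 1 with CPU last.

#include <stdio.h>

#define N_NODES    8
#define N_BACKENDS 3            /* hypothetical: priorities 0..1 = GPUs, 2 = CPU (last prio) */
#define UNASSIGNED (-1)

/* propagate the current non-CPU assignment onto unassigned nodes;
   step = -1 walks the graph backwards ("expand up"), step = +1 forwards ("expand down") */
static void expand(int * assign, int n, int n_backends, int step) {
    int cur = UNASSIGNED;
    for (int i = step > 0 ? 0 : n - 1; i >= 0 && i < n; i += step) {
        if (assign[i] != UNASSIGNED) {
            /* never expand the last-priority (CPU) backend */
            cur = (assign[i] == n_backends - 1) ? UNASSIGNED : assign[i];
        } else {
            assign[i] = cur;    /* may remain UNASSIGNED for a later pass */
        }
    }
}

int main(void) {
    /* hypothetical pre-assignments from pass 1 (-1 = unassigned, 2 = CPU) */
    int assign[N_NODES] = { -1, 0, -1, -1, 2, -1, 1, -1 };

    expand(assign, N_NODES, N_BACKENDS, -1); /* like pass 2.1: expand gpu up */
    expand(assign, N_NODES, N_BACKENDS, +1); /* like pass 2.2: expand gpu down */

    for (int i = 0; i < N_NODES; i++) {
        printf("node %d -> backend %d\n", i, assign[i]);
    }
    return 0;
}

Running this prints the final per-node assignment: the CPU-assigned node keeps its backend while the surrounding unassigned nodes are pulled onto the neighboring GPU backends, which is what keeps the number of splits low in the real scheduler.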