From ece0b0d855eb5c53ec9838f6a57b6afc6c6ba573 Mon Sep 17 00:00:00 2001
From: slaren
Date: Sat, 6 Jan 2024 00:58:22 +0100
Subject: [PATCH] improve graph splitting, partial fix for --no-kv-offload

---
 ggml-backend.c | 132 +++++++++++++++++++++++++++++++++++++++++++------
 ggml-cuda.cu   |  25 +---------
 llama.cpp      |  11 ++++-
 3 files changed, 127 insertions(+), 41 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index 8e791827255e0..31516b4c55efe 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -737,9 +737,16 @@ struct ggml_backend_sched_split {
     int i_end;
     struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
     int n_inputs;
+    // graph view of this split
     struct ggml_cgraph graph;
 };
 
+// TODO: group all the hash values into a single struct for clarity
+//struct sched_hash_value {
+//    ggml_tallocr_t tallocr;
+//    struct ggml_tensor * copies[GGML_MAX_BACKENDS];
+//};
+
 struct ggml_backend_sched {
     int n_backends;
     ggml_backend_t backends[GGML_MAX_BACKENDS];
@@ -747,11 +754,15 @@ struct ggml_backend_sched {
 
     ggml_gallocr_t galloc;
 
+    // hash keys of the nodes in the graph
     struct ggml_hash_set hash_set;
-    ggml_tallocr_t * node_talloc;                            // [hash_set.size]
-    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+    // hash values (arrays of [hash_set.size])
+    ggml_tallocr_t * node_talloc;                            // tallocr assigned to each node (indirectly this is the backend)
+    struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
 
+    // copy of the graph with modified inputs
     struct ggml_cgraph * graph;
+
     struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
     int n_splits;
 
@@ -928,6 +939,12 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co
     return dup;
 }
 
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
 // TODO: merge passes
 static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
@@ -977,17 +994,32 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
         }
     }
-    //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+    fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
+
+    // expand gpu backends (ie non last prio) up and down, ignoring cpu
+    // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
+
+    // pass 2.1 expand gpu up
     {
         ggml_tallocr_t cur_allocr = NULL;
         for (int i = graph->n_nodes - 1; i >= 0; i--) {
             struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
             ggml_tallocr_t node_allocr = node_allocr(node);
             if (node_allocr != NULL) {
-                cur_allocr = node_allocr;
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
             } else {
                 node_allocr(node) = cur_allocr;
                 SET_CAUSE(node, "2.cur");
@@ -995,12 +1027,58 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         }
     }
-    //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+    // pass 2.2 expand gpu down
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = 0; i < graph->n_nodes; i++) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+                    cur_allocr = NULL;
+                }
+                else {
+                    cur_allocr = node_allocr;
+                }
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+
+    // pass 2.3 expand rest up
+    {
+        ggml_tallocr_t cur_allocr = NULL;
+        for (int i = graph->n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = graph->nodes[i];
+            if (ggml_is_view_op(node->op)) {
+                continue;
+            }
+            ggml_tallocr_t node_allocr = node_allocr(node);
+            if (node_allocr != NULL) {
+                cur_allocr = node_allocr;
+            } else {
+                node_allocr(node) = cur_allocr;
+                SET_CAUSE(node, "2.cur");
+            }
+        }
+    }
+#ifdef DEBUG_PASS2
+    fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
-    // pass 3: assign backends to remaining src from dst (should only be leafs)
+    // pass 3: assign backends to remaining src from dst and view_src
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
-        ggml_tallocr_t node_allocr = node_allocr(node);
+        ggml_tallocr_t cur_allocr = node_allocr(node);
+        if (ggml_is_view_op(node->op) && cur_allocr == NULL) {
+            cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+            SET_CAUSE(node, "3.vsrc");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1008,11 +1086,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
             }
             ggml_tallocr_t src_allocr = node_allocr(src);
             if (src_allocr == NULL) {
-                node_allocr(src) = node_allocr;
+                if (src->view_src != NULL) {
+                    // views are always on the same backend as the source
+                    node_allocr(src) = node_allocr(src->view_src);
+                } else {
+                    node_allocr(src) = cur_allocr;
+                }
             }
         }
     }
-    //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+    fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
     // pass 4: split graph, find tensors that need to be copied
     {
@@ -1074,7 +1159,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                         sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
                     }
 
-                    // create copies
+                    // create a copy of the input in the split's backend
                     size_t id = hash_id(src);
                     if (sched->node_copies[id][cur_backend_id] == NULL) {
                         struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
@@ -1090,8 +1175,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         sched->splits[cur_split].i_end = graph->n_nodes;
         sched->n_splits = cur_split + 1;
     }
-
-    //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS4
+    fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
 
 #ifndef NDEBUG
     // sanity check: all sources should have the same backend as the node
@@ -1101,6 +1187,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         if (node_allocr == NULL) {
             fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
         }
+        if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+            fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+                node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+                node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+        }
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             struct ggml_tensor * src = node->src[j];
             if (src == NULL) {
@@ -1112,8 +1203,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
                     node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
                     j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
             }
+            if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+                fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+                    src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+                    src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+            }
         }
     }
+    fflush(stderr);
 #endif
 
     // create copies of the graph for each split
@@ -1127,6 +1224,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
         for (int j = 0; j < split->n_inputs; j++) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+            // add a dependency to the input source so that it is not freed before the copy is done
            input_cpy->src[0] = input;
             graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
         }
@@ -1163,19 +1261,20 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
             struct ggml_tensor * input = split->inputs[j];
             struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
             if (input->buffer == NULL) {
+                GGML_ASSERT(false);
                 if (input->view_src == NULL) {
                     fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
-                    exit(1);
+                    GGML_ASSERT(false);
                 }
                 // FIXME: may need to use the sched buffer instead
                 ggml_backend_view_init(input->view_src->buffer, input);
             }
             if (input_cpy->buffer == NULL) {
                 fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
-                exit(1);
+                GGML_ASSERT(false);
             }
-            //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
-            //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
+            // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+            // this is important to avoid copying constants such as KQ_mask and inp_pos multiple time
             ggml_backend_tensor_copy(input, input_cpy);
         }
         // ggml_backend_synchronize(split_backend);
@@ -1301,6 +1400,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 }
 
 // utils
+
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->buffer == NULL);
     //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index a17eeb2936c52..52bb8902c7a6f 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9712,6 +9712,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
         if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
             continue;
 
+#ifndef NDEBUG
         assert(node->backend == GGML_BACKEND_GPU);
         assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
         assert(node->extra != nullptr);
@@ -9723,35 +9724,13 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
                 assert(node->src[j]->extra != nullptr);
             }
         }
+#endif
 
         bool ok = ggml_cuda_compute_forward(&params, node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
         GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
     }
 
     UNUSED(backend);
diff --git a/llama.cpp b/llama.cpp
index d8fe1554f6f2d..5104350ca88d6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1730,7 +1730,6 @@ static bool llama_kv_cache_init(
             return false;
         }
         ggml_backend_buffer_clear(buf, 0);
-        // FIXME: buffer type name
         LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
         cache.bufs.push_back(buf);
     }
@@ -2463,9 +2462,9 @@ struct llama_model_loader {
         for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
             struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
             if (!cur) {
+                // some tensors may be allocated in a different context
                 continue;
             }
-            GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
 
             if (progress_callback) {
                 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -3734,6 +3733,8 @@ static bool llm_load_tensors(
         if (buf == nullptr) {
             throw std::runtime_error("failed to allocate buffer");
         }
+        // indicate that this buffer contains weights
+        // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are always scheduled to the backend that contains the weight
         ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
         model.bufs.push_back(buf);
         ctx_bufs.emplace_back(ctx, buf);
@@ -4336,6 +4337,12 @@ struct llm_build_context {
                     cb(Vcur, "Vcur", il);
                 }
 
+                // these nodes are added to the graph together so that they are not reordered
+                // by doing so, the number of splits in the graph is reduced
+                ggml_build_forward_expand(gf, Qcur);
+                ggml_build_forward_expand(gf, Kcur);
+                ggml_build_forward_expand(gf, Vcur);
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
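
Below is a minimal, standalone sketch of the pass 2.1/2.2 rule described in the scheduler comments above: assignments to non-last-priority (GPU) backends are expanded over unassigned ops in both directions, while the last-priority (CPU) backend is never expanded. All names and the toy node list are hypothetical and not part of ggml; this only illustrates the assignment rule, assuming backend priorities run from 0 to n_backends - 1 with CPU last.

#include <stdio.h>

#define N_NODES    8
#define N_BACKENDS 3            /* hypothetical: priorities 0..1 = GPUs, 2 = CPU (last prio) */
#define UNASSIGNED (-1)

/* propagate the current non-CPU assignment onto unassigned nodes;
   step = -1 walks the graph backwards ("expand up"), step = +1 forwards ("expand down") */
static void expand(int * assign, int n, int n_backends, int step) {
    int cur = UNASSIGNED;
    for (int i = step > 0 ? 0 : n - 1; i >= 0 && i < n; i += step) {
        if (assign[i] != UNASSIGNED) {
            /* never expand the last-priority (CPU) backend */
            cur = (assign[i] == n_backends - 1) ? UNASSIGNED : assign[i];
        } else {
            assign[i] = cur;    /* may remain UNASSIGNED for a later pass */
        }
    }
}

int main(void) {
    /* hypothetical pre-assignments from pass 1 (-1 = unassigned, 2 = CPU) */
    int assign[N_NODES] = { -1, 0, -1, -1, 2, -1, 1, -1 };

    expand(assign, N_NODES, N_BACKENDS, -1); /* like pass 2.1: expand gpu up */
    expand(assign, N_NODES, N_BACKENDS, +1); /* like pass 2.2: expand gpu down */

    for (int i = 0; i < N_NODES; i++) {
        printf("node %d -> backend %d\n", i, assign[i]);
    }
    return 0;
}

Running this prints the final per-node assignment: the CPU-assigned node keeps its backend while the surrounding unassigned nodes are pulled onto the neighboring GPU backends, which is what keeps the number of splits low in the real scheduler.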