Skip computation of much of last layer & unused logits during prompt eval / large N #2700

Closed
wants to merge 15 commits into from

Changes from 5 commits
5 changes: 5 additions & 0 deletions CMakeLists.txt
@@ -80,6 +80,7 @@ option(LLAMA_METAL "llama: use Metal"
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_K_QUANTS "llama: use k-quants" ON)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SKIP_UNUSED_LOGITS "llama: skip computation of unused logits" ON)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -390,6 +391,10 @@ if (LLAMA_HIPBLAS)
endif()
endif()

if (LLAMA_SKIP_UNUSED_LOGITS)
add_compile_definitions(LLAMA_SKIP_UNUSED_LOGITS)
endif()

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
5 changes: 5 additions & 0 deletions Makefile
@@ -326,6 +326,11 @@ k_quants.o: k_quants.c k_quants.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_NO_K_QUANTS

ifndef LLAMA_NO_SKIP_UNUSED_LOGITS
CFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS
CXXFLAGS += -DLLAMA_SKIP_UNUSED_LOGITS
endif

#
# Print build information
#
59 changes: 47 additions & 12 deletions llama.cpp
@@ -2156,7 +2156,8 @@ static struct ggml_cgraph * llm_build_llama(

GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT

const int N = n_tokens;
// Non-const to allow short-circuiting to the last token in the last layer in prompt eval mode.
int N = n_tokens;

const auto & model = lctx.model;
const auto & hparams = model.hparams;
@@ -2229,9 +2230,10 @@ static struct ggml_cgraph * llm_build_llama(
//
// with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
// in that case ggml_cuda_assign_buffers has no effect
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
offload_func_t offload_func_skip = llama_nop;
Collaborator

Why are you defining offload_func_skip? If I understand your code correctly it works by discarding the unneeded parts of tensors in the last layer. I don't understand why this would affect the offloading logic, i.e. whether data should be stored in RAM or VRAM.

Collaborator Author

I was hoping that keeping offload_func_skip distinct would help things, but there seems to be some entanglement anyway.

Not sure I understand the interplay between ggml_cuda_compute_forward and the CPU fallback route, but I've noticed the former skips GGML_OP_VIEW if its input isn't on the GPU. So I've now tried to make sure the offload of the cur and inpSA subviews matches whatever backend they are currently on, using the following defensive code, and... it seems to fix the issue:

auto cur_on_gpu = cur->backend == GGML_BACKEND_GPU;
cur   = ggml_view_2d(ctx0, cur,   n_embd, 1,   cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
if (cur_on_gpu) ggml_cuda_assign_buffers_no_alloc(cur);

auto inpSA_on_gpu = inpSA->backend == GGML_BACKEND_GPU;
inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
if (inpSA_on_gpu) ggml_cuda_assign_buffers_no_alloc(inpSA);

The issue is that with -ngl 1, i_gpu_start = n_layer - 1, so the last layer is the only one that offloads most of its tensors (as per the if (il >= i_gpu_start) { offload_func = ggml_cuda_assign_buffers_no_alloc; } block).
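
For reference, a minimal sketch of the offload-selection loop being referred to here (the loop shape and the i_gpu_start computation are assumptions paraphrased from llama.cpp of this era, not lines from this PR):

// Hedged sketch: with -ngl 1 we have n_gpu_layers == 1, so i_gpu_start == n_layer - 1
// and only the last layer picks the CUDA offload function.
const int i_gpu_start = n_layer - n_gpu_layers;
for (int il = 0; il < n_layer; ++il) {
    offload_func_t offload_func = llama_nop;
#ifdef GGML_USE_CUBLAS
    if (il >= i_gpu_start) {
        offload_func = ggml_cuda_assign_buffers_no_alloc;
    }
#endif // GGML_USE_CUBLAS
    // ... build this layer's tensors, calling offload_func(t) on each one ...
}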

Another, simpler fix (pushed to this PR) that also seems to work is to offload the view inputs (though that may offload more than intended?):

offload_func(cur);
cur   = ggml_view_2d(ctx0, cur,   n_embd, 1,   cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
offload_func(cur);

offload_func(inpSA);
inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
offload_func(inpSA);

Will try to find nicer ways to fix this (wondering if the CPU view fallback in a mostly GPU context is behaving as expected?), suggestions welcome 🙂

Collaborator

Not sure I understand the interplay between ggml_cuda_compute_forward and the CPU fallback route

The decision for whether or not CUDA should be used is based on data location. If the data of any of the tensors is in VRAM then all of the calculations are done on the GPU and the data is copied to/from VRAM as necessary based on backend. My preferred design would have been to add a state to the ggml context that sets the backend for all newly created tensors. But unfortunately this was vetoed by Georgi, who wanted to avoid GPU offloading logic in ggml. So the current design is kind of cancerous where you have to manually offload each tensor. For tensors that don't actually do any computations this is very awkward because there is no step where the data would be carried over if necessary. Maybe I should add an explicit check for those tensors to ensure that the backends are consistent.
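
As a rough illustration of the dispatch rule described above, a conceptual sketch follows (this is not the actual ggml-cuda code; the helper name is made up, only the backend field and GGML_BACKEND_GPU are real ggml identifiers):

// Conceptual sketch only: "if any tensor of the op is in VRAM, run the op on the GPU".
static bool op_runs_on_gpu(const struct ggml_tensor * dst,
                           const struct ggml_tensor * src0,
                           const struct ggml_tensor * src1) {
    return dst->backend == GGML_BACKEND_GPU
        || (src0 != NULL && src0->backend == GGML_BACKEND_GPU)
        || (src1 != NULL && src1->backend == GGML_BACKEND_GPU);
    // When true, CPU-resident inputs are copied into VRAM as needed, and the
    // result is copied back out if dst itself lives on the CPU.
}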


#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
@@ -2243,6 +2245,9 @@ static struct ggml_cgraph * llm_build_llama(
if (n_gpu_layers > n_layer + 2) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > 0) {
offload_func_skip = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS

struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
@@ -2284,18 +2289,10 @@ static struct ggml_cgraph * llm_build_llama(
offload_func_kq(tmpk);
ggml_set_name(tmpk, "tmpk");

struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");

struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");

// store key and value to memory
{
// compute the transposed [N, n_embd] V matrix
@@ -2323,6 +2320,37 @@ static struct ggml_cgraph * llm_build_llama(
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}

#ifdef LLAMA_SKIP_UNUSED_LOGITS
if (il == n_layer - 1 && !lctx.logits_all)
{
Collaborator

This does not follow the coding guidelines in the README.

Collaborator Author

Thanks, fixed.

// From here on, we only care about the last token and its logits.
// We do as if N = 1 (from the end), which means we only keep
// the last column of cur and inpSA ((n_embd, N) -> (n_embd, 1)).
//
// Note that we do this even when N==1 so that we don't change the # nodes in the graph,
// otherwise for Metal we'd have to rebuild the concurrency list.

cur = ggml_view_2d(ctx0, cur, n_embd, 1, cur->nb[1], (N - 1)*ggml_element_size(cur)*n_embd);
offload_func_skip(cur);
ggml_set_name(cur, "cur-lastpos");

inpSA = ggml_view_2d(ctx0, inpSA, n_embd, 1, inpSA->nb[1], (N - 1)*ggml_element_size(inpSA)*n_embd);
offload_func_skip(inpSA);
ggml_set_name(inpSA, "inpSA-lastpos");

n_past += N - 1;
N = 1;
}
#endif // LLAMA_SKIP_UNUSED_LOGITS

struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");

struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
offload_func_kq(Q);
ggml_set_name(Q, "Q");
@@ -2936,11 +2964,18 @@ static bool llama_eval_internal(

if (lctx.logits_all) {
logits_out.resize(n_vocab * N);
GGML_ASSERT(ggml_nelements(res) == n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
} else {
// return result for just the last token
logits_out.resize(n_vocab);
#ifdef LLAMA_SKIP_UNUSED_LOGITS
GGML_ASSERT(ggml_nelements(res) == n_vocab);
memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab);
#else
GGML_ASSERT(ggml_nelements(res) == n_vocab * N);
memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
#endif
}
}
