From c48679a8e87b1489c3a2b813354a6487e789d337 Mon Sep 17 00:00:00 2001
From: CausalLM <148736309+CausalLM@users.noreply.github.com>
Date: Fri, 1 Dec 2023 21:54:58 +0800
Subject: [PATCH 1/3] Support attention_bias on LLaMA architecture

QKVO bias, should fix InternLM (https://github.com/ggerganov/llama.cpp/issues/3133) and works for LLaMAfied Qwen models (https://github.com/ggerganov/llama.cpp/pull/3743#issuecomment-1825923608).
---
 llama.cpp | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index e74fd7234eb84..96f46e48932f0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1248,6 +1248,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
@@ -2781,6 +2784,11 @@ static void llm_load_tensors(
                         layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
@@ -2791,8 +2799,9 @@ static void llm_load_tensors(
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
                                 ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
-                                ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
+                                ggml_nbytes(layer.bk) + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
@@ -3891,13 +3900,25 @@ struct llm_build_context {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
                 cb(Vcur, "Vcur", il);
-
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -3915,7 +3936,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, NULL,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }
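
The graph change in patch 1 relies on ggml_add broadcasting the 1-D bias tensor over every token column of the projected activation, plus NULL checks so that models without bias tensors stay on the old code path. The standalone C++ sketch below (plain C++, not the ggml API; the helper name add_bias, the dimensions, and the sample data are illustrative assumptions) shows the arithmetic that each per-layer ggml_add(ctx0, Qcur, model.layers[il].bq) performs:

#include <cstdio>
#include <vector>

// Broadcast-add a per-channel bias to a column-major [n_out x n_tokens]
// activation (the result of a Q/K/V projection): every token column gets
// the same bias vector added to it.
static void add_bias(std::vector<float> & x, const std::vector<float> & bias,
                     int n_out, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        for (int i = 0; i < n_out; ++i) {
            x[t * n_out + i] += bias[i];
        }
    }
}

int main() {
    const int n_out = 4, n_tokens = 2;
    std::vector<float> qcur(n_out * n_tokens, 1.0f);   // stand-in for wq * cur
    std::vector<float> bq = {0.1f, 0.2f, 0.3f, 0.4f};  // stand-in for the per-layer attn_q bias
    add_bias(qcur, bq, n_out, n_tokens);
    printf("%.1f %.1f\n", qcur[0], qcur[n_out]);       // prints: 1.1 1.1
    return 0;
}

Guarding each add with if (model.layers[il].bq) keeps plain LLaMA checkpoints, which ship no attention bias, on the unchanged path; only models that actually provide the bias tensors pay for the extra adds.
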
From e192572d211ffe35cf3f0b7c839fe8df00900770 Mon Sep 17 00:00:00 2001
From: CausalLM <148736309+CausalLM@users.noreply.github.com>
Date: Sat, 2 Dec 2023 00:56:48 +0800
Subject: [PATCH 2/3] check existence of qkvo bias while loading llama models

Tested on LLaMA2, CUDA and CPU.
---
 llama.cpp | 39 +++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 8 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 96f46e48932f0..221221b80ce6b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2785,10 +2785,29 @@ static void llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
 
-                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
-                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
-                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
-                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+                        try {
+                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+                        }
+
+                        try {
+                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+                        }
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
@@ -2798,10 +2817,14 @@ static void llm_load_tensors(
 
                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
-                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bq) +
-                                ggml_nbytes(layer.bk) + ggml_nbytes(layer.bv) + ggml_nbytes(layer.bo) +
-                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) +
+                                (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                                (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                                (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                                (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
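
Patch 2 makes the four bias tensors optional: a load failure whose message contains "not found" is treated as "this model has no attention bias" and the pointer is left NULL (which the NULL checks from patch 1 already handle), while any other loader error still propagates. Below is a compilable sketch of that pattern; load_tensor_or_null, the opaque tensor struct, and the fake_loader lambda are illustrative stand-ins for ml.create_tensor and the GGUF loader, not names from llama.cpp:

#include <cstdio>
#include <functional>
#include <stdexcept>
#include <string>

struct tensor;  // opaque stand-in for ggml_tensor

// Run a loader; map a "tensor ... not found" failure to NULL (optional
// tensor) and re-throw anything else (corrupt file, wrong shape, ...).
static tensor * load_tensor_or_null(const std::function<tensor *()> & load) {
    try {
        return load();
    } catch (const std::runtime_error & e) {
        if (std::string(e.what()).find("not found") != std::string::npos) {
            return nullptr;
        }
        throw;
    }
}

int main() {
    auto fake_loader = []() -> tensor * {
        // simulate a model file that simply has no attention bias tensors
        throw std::runtime_error("tensor 'blk.0.attn_q.bias' not found");
    };
    tensor * bq = load_tensor_or_null(fake_loader);
    printf("bq is %s\n", bq ? "present" : "NULL (no attention bias in this model)");
    return 0;
}

Matching on the exception text keeps the patch small, but it couples the check to the wording of the loader's error message; an explicit "optional" flag on the tensor loader would be the more robust long-term interface.
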
From b1efaed381b6644620155acac8e7c21a09c2ce20 Mon Sep 17 00:00:00 2001
From: CausalLM <148736309+CausalLM@users.noreply.github.com>
Date: Sat, 2 Dec 2023 01:09:17 +0800
Subject: [PATCH 3/3] Update llama.cpp

---
 llama.cpp | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 221221b80ce6b..33a24f1225157 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2784,29 +2784,29 @@ static void llm_load_tensors(
                         layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
-
-                        try {
-                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
+
+                        try {
+                            layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bq = NULL; else throw;
                         }
 
-                        try {
-                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
+                        try {
+                            layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bk = NULL; else throw;
                         }
 
-                        try {
-                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
+                        try {
+                            layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bv = NULL; else throw;
                         }
 
-                        try {
-                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
-                        } catch (const std::runtime_error& e) {
-                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
+                        try {
+                            layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+                        } catch (const std::runtime_error& e) {
+                            if (std::string(e.what()).find("not found") != std::string::npos) layer.bo = NULL; else throw;
                         }
 
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
@@ -3941,7 +3941,7 @@ struct llm_build_context {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                     cb(Vcur, "Vcur", il);
                 }
-
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
@@ -3959,7 +3959,7 @@ struct llm_build_context {
                 llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
                 cur = llm_build_kqv(ctx0, hparams, kv_self,
-                        model.layers[il].wo, model.layers[il].bo,
+                        model.layers[il].wo, model.layers[il].bo,
                         Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
                 cb(cur, "kqv_out", il);
             }