
llama : quantize token_embd.weight using output type
ggerganov committed Feb 22, 2024
1 parent f181e60 commit 488bd97
Showing 1 changed file with 4 additions and 4 deletions.
llama.cpp
@@ -10481,7 +10481,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };

-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
@@ -10499,9 +10502,6 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_Q4_K;
         }
-        else if (arch == LLM_ARCH_GEMMA) {
-            new_type = GGML_TYPE_Q8_0;
-        }
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
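
Below is a minimal, self-contained C++ sketch of the tied-output detection idea in the new condition. The enums, map, and helper here (arch_t, tensor_t, TENSOR_NAMES, use_output_quant) are simplified stand-ins invented for illustration, not llama.cpp's actual LLM_ARCH_*/LLM_TENSOR_* types: when an architecture defines no separate output tensor, its token embedding matrix doubles as the output projection, so it is routed through the output tensor's quantization rules.

#include <cstdio>
#include <map>
#include <string>

// Simplified stand-ins for LLM_ARCH_* and LLM_TENSOR_* (hypothetical, for illustration).
enum arch_t   { ARCH_LLAMA, ARCH_GEMMA };
enum tensor_t { TENSOR_TOKEN_EMBD, TENSOR_OUTPUT };

// Per-architecture tensor tables, mimicking LLM_TENSOR_NAMES: Gemma ties the output
// projection to the token embeddings, so it has no TENSOR_OUTPUT entry at all.
static const std::map<arch_t, std::map<tensor_t, std::string>> TENSOR_NAMES = {
    { ARCH_LLAMA, { { TENSOR_TOKEN_EMBD, "token_embd" }, { TENSOR_OUTPUT, "output" } } },
    { ARCH_GEMMA, { { TENSOR_TOKEN_EMBD, "token_embd" } } },
};

// Mirrors the condition added in the diff: a tensor gets the output quantization
// rules either when it is literally the output weight, or when the arch defines no
// output tensor and the tensor is the token embedding (the two are one shared tensor).
static bool use_output_quant(arch_t arch, const std::string & name) {
    const auto & names = TENSOR_NAMES.at(arch);
    const bool tied = names.find(TENSOR_OUTPUT) == names.end();
    return name == "output.weight" || (tied && name == "token_embd.weight");
}

int main() {
    // LLaMA has a separate output tensor, so token_embd keeps its own quantization.
    std::printf("llama token_embd: %d\n", use_output_quant(ARCH_LLAMA, "token_embd.weight")); // 0
    // Gemma shares the tensor, so token_embd is quantized like the output.
    std::printf("gemma token_embd: %d\n", use_output_quant(ARCH_GEMMA, "token_embd.weight")); // 1
    std::printf("llama output:     %d\n", use_output_quant(ARCH_LLAMA, "output.weight"));     // 1
    return 0;
}

This generic check is what makes the removed Gemma-specific branch redundant: Gemma's token_embd.weight now takes the same path as output.weight and picks up the output rules (for example, GGML_TYPE_Q8_0 when the row size is not a multiple of QK_K).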
