src/llama-quant.cpp: 27 changes (0 additions, 27 deletions)
@@ -666,19 +666,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     std::map<int, std::string> mapped;
     int blk_id = 0;
-    int pruned_attention_w = 0;
 
     // make a list of weights
     std::vector<const llama_model_loader::llama_tensor_weight *> tensors;
     tensors.reserve(ml.weights_map.size());
     for (const auto & it : ml.weights_map) {
         const std::string remapped_name(remap_layer(it.first, prune_list, mapped, blk_id));
         if (remapped_name.empty()) {
-            if (it.first.find("attn_v.weight") != std::string::npos ||
-                it.first.find("attn_qkv.weight") != std::string::npos ||
-                it.first.find("attn_kv_b.weight") != std::string::npos) {
-                pruned_attention_w++;
-            }
             LLAMA_LOG_DEBUG("%s: pruning tensor %s\n", __func__, it.first.c_str());
             continue;
         }
@@ -703,7 +697,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
-    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -717,30 +710,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
-
-        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
-    // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0 && !is_clip_model)
-    {
-        int32_t n_layer_all = model.hparams.n_layer;
-        if (llama_model_has_encoder(&model)) {
-            // now n_layer_all is the number of attention layers in the encoder
-            // for each decoder block, there are 2 attention layers
-            n_layer_all += 2 * model.hparams.dec_n_layer;
-        }
-
-        // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
-        const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
-
-        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
-
-        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
-    }
-
     size_t total_size_org = 0;
     size_t total_size_new = 0;
 
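For reference, the deleted sanity check boiled down to a layer-count identity: the number of attention tensors observed during quantization (`qs.n_attention_wv`) was expected to equal the total attention layers (encoder layers plus two per decoder block, per the removed comment), minus layers dropped by the prune list, minus recurrent/linear-attention layers. The following is a minimal, self-contained sketch of that arithmetic only; the names and the example numbers are hypothetical, not the llama.cpp API.

```cpp
#include <cassert>
#include <cstdio>

// Hypothetical stand-in for the removed check in llama_model_quantize_impl():
// the attention tensor count seen while quantizing should equal the total
// attention layers minus pruned layers minus recurrent layers.
struct layer_counts {
    int n_layer_all;        // attention layers (encoder + 2 per decoder block, per the removed comment)
    int n_layer_recr;       // recurrent / linear-attention layers
    int pruned_attention_w; // attention tensors dropped by the prune list
};

static void check_attention_count(int n_attention_wv, const layer_counts & c) {
    const int expected = c.n_layer_all - c.pruned_attention_w - c.n_layer_recr;
    std::printf("n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n",
                c.n_layer_all, c.n_layer_recr, c.pruned_attention_w);
    assert(n_attention_wv == expected && "n_attention_wv is unexpected");
}

int main() {
    // e.g. a 32-layer model with 4 recurrent layers and 2 pruned attention blocks
    check_attention_count(/*n_attention_wv=*/26, {32, 4, 2});
    return 0;
}
```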