From ce469995ff2bbd1119f01537be4aeecbc6641404 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 17:49:58 +0100 Subject: [PATCH 01/43] Implement LLaDA2MoeModel conversion in convert_hf_to_gguf.py --- convert_hf_to_gguf.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8743202ad6..f2aa55828c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8714,6 +8714,40 @@ def prepare_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LLaDA2MoeModelLM") +class LLaDA2MoeModel(BailingMoeV2Model): + model_arch = gguf.MODEL_ARCH.LLADA2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Set block_count directly from num_hidden_layers + self.block_count = self.hparams["num_hidden_layers"] + # Use the same tensor mapping as BailingMoeV2Model since they have identical tensor structure + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + + # Override specific parameters for LLaDA2.0 + # Add LLaDA2.0 specific parameters using generic add_key_value method + if "max_window_layers" in hparams: + self.gguf_writer.add_key_value("llada.max_window_layers", hparams["max_window_layers"], gguf.GGUFValueType.UINT32) + + if "output_router_logits" in hparams: + self.gguf_writer.add_key_value("llada.output_router_logits", hparams["output_router_logits"], gguf.GGUFValueType.BOOL) + + # Handle sliding window configuration + if "use_sliding_window" in hparams: + self.gguf_writer.add_key_value("llada.use_sliding_window", hparams["use_sliding_window"], gguf.GGUFValueType.BOOL) + + # Set the correct pad token ID for LLaDA2.0 + if "pad_token_id" in hparams: + self.gguf_writer.add_pad_token_id(hparams["pad_token_id"]) + + def set_vocab(self): + # Use the same vocab as BailingMoeV2Model + self._set_vocab_gpt2() @ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") class GroveMoeModel(TextModel): From 9716bd4c2c6b66bf8c2964d93a5bd0ed59570c2a Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 17:52:04 +0100 Subject: [PATCH 02/43] Add LLADA2 architecture to constants --- gguf-py/gguf/constants.py | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1cd0efad4a..049d7f6bd8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -422,6 +422,7 @@ class MODEL_ARCH(IntEnum): SMALLTHINKER = auto() LLADA = auto() LLADA_MOE = auto() + LLADA2 = auto() SEED_OSS = auto() GROVEMOE = auto() APERTUS = auto() @@ -792,6 +793,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", MODEL_ARCH.LLADA_MOE: "llada-moe", + MODEL_ARCH.LLADA2: "llada2", MODEL_ARCH.SEED_OSS: "seed_oss", MODEL_ARCH.GROVEMOE: "grovemoe", MODEL_ARCH.APERTUS: "apertus", @@ -2936,6 +2938,35 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP_EXP, MODEL_TENSOR.FFN_DOWN_EXP, ], + MODEL_ARCH.LLADA2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + 
MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.GROVEMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, From a9e81a6a6d76986a7cde7c0cd14f534fa447cad7 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:21:04 +0100 Subject: [PATCH 03/43] Implement LLaDA2.0 support to diffusion-cli.cpp Added LLaDA2.0 support --- examples/diffusion/diffusion-cli.cpp | 248 ++++++++++++++++++++++----- 1 file changed, 201 insertions(+), 47 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 273942a165..4b55e2a13f 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -49,8 +49,10 @@ struct diffusion_params { int32_t block_length = 0; // Block size (for block scheduling) float alg_temp = 0; // algorithm temperature (0.0 = deterministic) bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 + float threshold = 0.95f; // Confidence threshold for transfer int32_t max_length = 0; // Maximum sequence length + bool is_llada2 = false; // LLaDA2.0 specific processing }; struct callback_data { @@ -232,6 +234,11 @@ static void diffusion_generate(llama_context * ctx, std::vector mask_positions; mask_positions.reserve(params.max_length); + // Get EOS token for early termination + const llama_vocab * vocab = llama_model_get_vocab(model); + llama_token eos_token_id = llama_vocab_eos(vocab); + LOG_INF("DEBUG: EOS token ID = %d\n", eos_token_id); + // Setup sampler chain struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); if (params.top_k > 0) { @@ -267,8 +274,15 @@ static void diffusion_generate(llama_context * ctx, if (params.schedule == BLOCK_BASED) { GGML_ASSERT(params.max_length % params.block_length == 0); num_blocks = params.max_length / params.block_length; - GGML_ASSERT(params.steps % num_blocks == 0); - steps_per_block = params.steps / num_blocks; + + if (params.is_llada2) { + // LLaDA2.0: steps parameter is steps PER block + steps_per_block = params.steps; + } else { + // Dream/LLaDA1.0: steps parameter is TOTAL steps across all blocks + GGML_ASSERT(params.steps % num_blocks == 0); + steps_per_block = params.steps / num_blocks; + } } std::vector confidence(params.max_length); @@ -277,35 +291,69 @@ static void diffusion_generate(llama_context * ctx, int64_t total_time = 0; int64_t time_start = ggml_time_us(); - for (int block_num = 0; block_num < num_blocks; block_num++) { - int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; - int32_t block_end = (params.schedule == BLOCK_BASED) ? - std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : - params.max_length; + bool all_tokens_filled = false; + for (int block_num = 0; block_num < num_blocks && !all_tokens_filled; block_num++) { + int32_t block_start, block_end; + + if (params.is_llada2) { + // LLaDA2.0: blocks start from position 0 + block_start = (params.schedule == BLOCK_BASED) ? block_num * params.block_length : 0; + block_end = (params.schedule == BLOCK_BASED) ? 
+ std::min((block_num + 1) * params.block_length, params.max_length) : + params.max_length; + + // Skip blocks fully within the prompt (already processed) + if (block_end <= n_input) { + continue; + } + } else { + // Dream/LLaDA1.0: blocks start after input prompt + block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; + block_end = (params.schedule == BLOCK_BASED) ? + std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : + params.max_length; + } // Count masked tokens in current block for block-based processing if (params.schedule == BLOCK_BASED) { - int32_t block_mask_count = 0; - for (int i = block_start; i < block_end; i++) { - if (output_tokens[i] == params.mask_token_id) { - block_mask_count++; + if (params.is_llada2) { + // LLaDA2.0: use block_length for scheduling (Python reference behavior) + num_transfer_tokens = get_num_transfer_tokens(params.block_length, steps_per_block); + } else { + // Dream/LLaDA1.0: count actual masked tokens in current block + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (output_tokens[i] == params.mask_token_id) { + block_mask_count++; + } } + num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } - num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } for (int32_t step = 0; step < steps_per_block; step++) { int32_t global_step = block_num * steps_per_block + step; - + int32_t total_steps = (params.schedule == BLOCK_BASED) ? (num_blocks * steps_per_block) : params.steps; + if (params.step_callback) { if (!params.step_callback( - global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { + global_step, total_steps, output_tokens, params.max_length, params.step_callback_user_data)) { break; } } // Setup batch - for (int32_t i = 0; i < params.max_length; i++) { + int32_t batch_size; + if (params.is_llada2) { + // LLaDA2.0: truncate to block_end to avoid attending to future masks + batch_size = block_end; + } else { + // Dream/LLaDA1.0: process full sequence + batch_size = params.max_length; + } + + batch.n_tokens = batch_size; + for (int32_t i = 0; i < batch_size; i++) { batch.token[i] = output_tokens[i]; batch.pos[i] = i; batch.n_seq_id[i] = 1; @@ -446,9 +494,13 @@ static void diffusion_generate(llama_context * ctx, step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); if (transfer_count > 0) { - if (params.alg_temp == 0.0f) { + int32_t actual_transfer_count; + + if (params.is_llada2) { + // LLaDA2.0: threshold-based confidence approach + // Sort by confidence (descending) std::partial_sort(confidences.begin(), - confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), confidences.end(), [](const std::pair & a, const std::pair & b) { if (a.first != b.first) { @@ -457,42 +509,124 @@ static void diffusion_generate(llama_context * ctx, return a.second < b.second; }); - for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { - int32_t mask_idx = confidences[i].second; - int32_t pos = mask_positions[mask_idx]; - output_tokens[pos] = sampled_tokens[mask_idx]; + // Count high confidence tokens + int32_t high_conf_count = 0; + float threshold = params.threshold; + for (const auto& item : confidences) { + if (item.first > threshold) { + high_conf_count++; + } } + + actual_transfer_count = transfer_count; + if (high_conf_count >= transfer_count) { + 
actual_transfer_count = high_conf_count; + } + actual_transfer_count = std::min(actual_transfer_count, (int32_t)confidences.size()); + } else { - conf_candidates.clear(); - for (size_t i = 0; i < confidences.size(); i++) { - float conf_logit = confidences[i].first / params.alg_temp; - conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f }); + // Dream/LLaDA1.0: alg_temp-based approach (original implementation) + if (params.alg_temp == 0.0f) { + // Deterministic selection: sort and take top transfer_count + std::partial_sort(confidences.begin(), + confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + actual_transfer_count = std::min(transfer_count, (int32_t) confidences.size()); + } else { + // Stochastic selection using alg_temp + conf_candidates.clear(); + for (size_t i = 0; i < confidences.size(); i++) { + float conf_logit = confidences[i].first / params.alg_temp; + conf_candidates.emplace_back(llama_token_data{ (int32_t) i, conf_logit, 0.0f }); + } + + llama_token_data_array conf_array = { + conf_candidates.data(), + conf_candidates.size(), + -1, + false, + }; + + // Sample transfer_count positions stochastically + actual_transfer_count = std::min(transfer_count, (int32_t) confidences.size()); + for (int32_t i = 0; i < actual_transfer_count; i++) { + llama_sampler_apply(dist_sampler, &conf_array); + int32_t selected_idx = conf_array.selected; + int32_t mask_idx = selected_idx; + int32_t pos = mask_positions[mask_idx]; + output_tokens[pos] = sampled_tokens[mask_idx]; + + // Mark as used by setting p to 0 + conf_candidates[selected_idx].p = 0.0f; + conf_array.selected = -1; + } + // Skip the common transfer loop below for stochastic case + actual_transfer_count = 0; } + } - llama_token_data_array conf_array = { - conf_candidates.data(), - conf_candidates.size(), - -1, - false, - }; - - for (int32_t i = 0; i < std::min(transfer_count, (int32_t) confidences.size()); i++) { - llama_sampler_apply(dist_sampler, &conf_array); - int32_t selected_idx = conf_array.selected; - int32_t mask_idx = selected_idx; - int32_t pos = mask_positions[mask_idx]; - output_tokens[pos] = sampled_tokens[mask_idx]; - - conf_candidates[selected_idx].p = 0.0f; - conf_array.selected = -1; + // Transfer tokens (deterministic case for both models) + for (int32_t i = 0; i < actual_transfer_count; i++) { + int32_t mask_idx = confidences[i].second; + int32_t pos = mask_positions[mask_idx]; + llama_token transferred_token = sampled_tokens[mask_idx]; + output_tokens[pos] = transferred_token; + + // EOS early stop (LLaDA2.0 only) + if (params.is_llada2 && transferred_token == eos_token_id) { + // Verify all tokens from n_input to pos are filled + bool all_filled_before_eos = true; + for (int32_t j = n_input; j < pos; j++) { + if (output_tokens[j] == params.mask_token_id) { + all_filled_before_eos = false; + break; + } + } + if (all_filled_before_eos) { + LOG_INF("\nEOS detected at position %d, all prior tokens filled. 
Terminating.\n", pos); + n_generated = pos + 1 - n_input; + all_tokens_filled = true; + break; + } } } + if (params.is_llada2 && all_tokens_filled) break; // Exit step loop + } else { + LOG_INF("DEBUG: Transfer count is 0!\n"); } } int64_t time_end_sampling = ggml_time_us(); total_sampling_time += time_end_sampling - time_start_sampling; } + + // Check for EOS after block completes (LLaDA2.0 only) + if (params.is_llada2) { + for (int32_t i = n_input; i < block_end; i++) { + if (output_tokens[i] == eos_token_id) { + // Check if all tokens before EOS are filled + bool all_filled = true; + for (int32_t j = n_input; j < i; j++) { + if (output_tokens[j] == params.mask_token_id) { + all_filled = false; + break; + } + } + if (all_filled) { + LOG_INF("\nEOS found at position %d after block %d. Terminating.\n", i, block_num); + n_generated = i + 1 - n_input; + all_tokens_filled = true; + break; + } + } + } + } } int64_t time_end = ggml_time_us(); @@ -567,11 +701,18 @@ int main(int argc, char ** argv) { llama_model_free(model); return 1; } - + + // Compute max_length early to ensure n_ubatch is large enough + int32_t max_length = params.n_predict > 0 ? params.n_predict : params.n_ctx; + + LOG_INF("DEBUG: params.n_ctx = %d\n", params.n_ctx); + LOG_INF("DEBUG: params.n_predict = %d\n", params.n_predict); + LOG_INF("DEBUG: max_length = %d\n", max_length); + llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; - ctx_params.n_batch = params.n_batch; - ctx_params.n_ubatch = params.n_ubatch; + ctx_params.n_batch = std::max(params.n_batch, max_length); // Ensure n_batch >= max_length NOT FINAL ITS JUST FOR TESTING! + ctx_params.n_ubatch = std::max(params.n_ubatch, max_length); // Ensure n_ubatch >= max_length ctx_params.flash_attn_type = params.flash_attn_type; ctx_params.no_perf = params.no_perf; ctx_params.type_k = params.cache_type_k; @@ -611,15 +752,27 @@ int main(int argc, char ** argv) { bool visual_mode = params.diffusion.visual_mode; int32_t n_generated = 0; - std::vector output_tokens(params.n_ubatch); + std::vector output_tokens(max_length); struct diffusion_params diff_params; + // Detect if this is LLaDA2.0 model for conditional behavior + bool is_llada2 = false; + char model_arch_str[64]; + if (llama_model_meta_val_str(model, "general.architecture", model_arch_str, sizeof(model_arch_str)) >= 0) { + is_llada2 = (strcmp(model_arch_str, "llada2") == 0); + } + char shift_logits_str[8]; if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); } else { - diff_params.shift_logits = true; + // Model-dependent default + if (is_llada2) { + diff_params.shift_logits = false; // LLaDA2.0 default: unshifted logits + } else { + diff_params.shift_logits = true; // Dream/LLaDA1.0 default: shifted logits + } } //Use either eps or block length, but not both @@ -638,11 +791,12 @@ int main(int argc, char ** argv) { diff_params.temperature = params.sampling.temp; diff_params.steps = params.diffusion.steps; diff_params.algorithm = static_cast(params.diffusion.algorithm); - diff_params.max_length = params.n_ubatch; + diff_params.max_length = max_length; diff_params.top_p = params.sampling.top_p; diff_params.top_k = params.sampling.top_k; diff_params.visual_mode = params.diffusion.visual_mode; diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise; + diff_params.is_llada2 = is_llada2; diff_params.step_callback = diffusion_step_callback; 
callback_data cb_data = { &diff_params, vocab, n_input }; From bfc0b312af54c11fc6f6502ef30ef34a8897d811 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:22:03 +0100 Subject: [PATCH 04/43] Add llada2.cpp to CMakeLists.txt --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ec95ee176..feacd0c583 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -85,6 +85,7 @@ add_library(llama models/lfm2.cpp models/llada-moe.cpp models/llada.cpp + models/llada2.cpp models/llama-iswa.cpp models/llama.cpp models/mamba.cpp From 85f5285e43575287585fade4d6e793cc9fe25b00 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:23:39 +0100 Subject: [PATCH 05/43] Add LLADA2 architecture support --- src/llama-arch.cpp | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b2eb2477f9..1263e0a232 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -103,6 +103,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_LLADA_MOE, "llada-moe" }, + { LLM_ARCH_LLADA2, "llada2" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_GROVEMOE, "grovemoe" }, { LLM_ARCH_APERTUS, "apertus" }, @@ -2057,6 +2058,38 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, }, }, + { + LLM_ARCH_LLADA2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, + { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, + { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, + { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, + { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + }, + }, { LLM_ARCH_DOTS1, { @@ -2722,6 +2755,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) { case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: case LLM_ARCH_LLADA_MOE: + case LLM_ARCH_LLADA2: return true; default: return false; From d5a477912d6a909cddc7a03d36121a3d81a40124 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:24:15 +0100 Subject: [PATCH 06/43] Add LLM_ARCH_LLADA2 to architecture list --- src/llama-arch.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-arch.h b/src/llama-arch.h index ae7fa222ac..8866ff372f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -107,6 +107,7 @@ enum llm_arch { 
LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, LLM_ARCH_LLADA_MOE, + LLM_ARCH_LLADA2, LLM_ARCH_SEED_OSS, LLM_ARCH_GROVEMOE, LLM_ARCH_APERTUS, From 3db37fd4836565b84bdd0b5d0c306d732f9f581c Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:38:00 +0100 Subject: [PATCH 07/43] Add llada2.0 to llama-model.cpp --- src/llama-model.cpp | 95 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 175549a9e3..d58b13bc56 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1995,6 +1995,29 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_LLADA2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); + + // TODO: when MTP is implemented, this should probably be updated if needed + hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; + + switch (hparams.n_layer) { + case 20: type = LLM_TYPE_16B_A1B; break; + case 21: type = LLM_TYPE_16B_A1B; break; + case 32: type = LLM_TYPE_100B_A6B; break; + case 33: type = LLM_TYPE_100B_A6B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_DOTS1: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -5650,6 +5673,70 @@ bool llama_model::load_tensors(llama_model_loader & ml) { GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2"); GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); + for (int i = 0; i < n_layer; ++i) { + int flags = 0; + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + // skip all tensors in the NextN layers + flags |= TENSOR_SKIP; + } + + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); + + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags); + + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); + + if (static_cast(i) >= hparams.n_layer_dense_lead) { // MoE layers + const int64_t n_ff_shexp = (hparams.n_ff_shexp ? 
hparams.n_ff_shexp : n_ff_exp) * n_expert_shared; + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); + layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags); + + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); + + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags); + } else { // Dense layers + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); + } + + // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers + if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags); + layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags); + } + } + } break; + case LLM_ARCH_LLADA2: + { + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada2"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada2"); + for (int i = 0; i < n_layer; ++i) { int flags = 0; if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { @@ -6742,7 +6829,7 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm); } - if (arch == LLM_ARCH_BAILINGMOE2) { + if (arch == LLM_ARCH_BAILINGMOE2 || arch == LLM_ARCH_LLADA2) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp); @@ -6881,6 +6968,7 @@ 
llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: + case LLM_ARCH_LLADA2: case LLM_ARCH_LLADA_MOE: { res = nullptr; @@ -7330,6 +7418,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_LLADA2: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_SEED_OSS: { llm = std::make_unique(*this, params); @@ -7632,6 +7724,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_COGVLM: case LLM_ARCH_PANGU_EMBED: case LLM_ARCH_AFMOE: + case LLM_ARCH_LLADA2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: From b763f9b46df84baf9aa017dc248aa5cc72bca1f4 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:40:24 +0100 Subject: [PATCH 08/43] Create llada2.cpp --- src/models/llada2.cpp | 134 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 src/models/llada2.cpp diff --git a/src/models/llada2.cpp b/src/models/llada2.cpp new file mode 100644 index 0000000000..8fba3c8acd --- /dev/null +++ b/src/models/llada2.cpp @@ -0,0 +1,134 @@ +#include "models.h" + +llm_build_llada2::llm_build_llada2(const llama_model & model, const llm_graph_params & params) : + llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Non-causal attention for diffusion (from LLaDA) + auto * inp_attn = build_attn_inp_no_cache(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self_attention + { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 0 * sizeof(float) * (n_embd)); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); + + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, model.layers[il].bo, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_transformer_layers - 1 
&& inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA); + cb(sa_out, "sa_out", il); + + // MoE branch + cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + if (static_cast(il) < hparams.n_layer_dense_lead) { + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + ggml_tensor * moe_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(moe_out, "ffn_moe_out", il); + + { + ggml_tensor * ffn_shexp = + build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, sa_out); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} From 07180ebbf753965b8000f22e0429741fb647b5e9 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Sun, 23 Nov 2025 19:41:37 +0100 Subject: [PATCH 09/43] Add llm_build_llada2 struct to models.h --- src/models/models.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/models/models.h b/src/models/models.h index 4d7aeb4f42..6f2e408cd6 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -297,6 +297,10 @@ struct llm_build_llada : public llm_graph_context { llm_build_llada(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_llada2 : public llm_graph_context { + llm_build_llada2(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_llada_moe : public llm_graph_context { llm_build_llada_moe(const llama_model & model, const llm_graph_params & params); }; From e0714609facebdf1ad8b26cc3863bceed0fd7a0d Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 24 Nov 2025 16:45:27 +0100 Subject: [PATCH 10/43] Add proper fall-through for llada2.0 --- src/llama-model.cpp | 93 ++------------------------------------------- 1 file changed, 4 insertions(+), 89 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d58b13bc56..5954e564e1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1972,30 +1972,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; - case LLM_ARCH_BAILINGMOE2: - { - ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); - ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); - ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp); - 
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); - ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); - ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); - ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func); - ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false); - - // TODO: when MTP is implemented, this should probably be updated if needed - hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers; - - switch (hparams.n_layer) { - case 20: type = LLM_TYPE_16B_A1B; break; - case 21: type = LLM_TYPE_16B_A1B; break; - case 32: type = LLM_TYPE_100B_A6B; break; - case 33: type = LLM_TYPE_100B_A6B; break; - default: type = LLM_TYPE_UNKNOWN; - } - } break; case LLM_ARCH_LLADA2: + case LLM_ARCH_BAILINGMOE2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); @@ -5659,71 +5637,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); } } break; - case LLM_ARCH_BAILINGMOE2: - { - const int64_t n_ff_exp = hparams.n_ff_exp; - const int64_t n_expert_shared = hparams.n_expert_shared; - - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - - // output - output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - - GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2"); - GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2"); - - for (int i = 0; i < n_layer; ++i) { - int flags = 0; - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { - // skip all tensors in the NextN layers - flags |= TENSOR_SKIP; - } - - auto & layer = layers[i]; - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, flags); - - layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, flags); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, flags); - - layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, flags); - layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, flags); - - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); - - if (static_cast(i) >= hparams.n_layer_dense_lead) { // MoE layers - const int64_t n_ff_shexp = (hparams.n_ff_shexp ? 
hparams.n_ff_shexp : n_ff_exp) * n_expert_shared; - - layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, flags); - layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, TENSOR_NOT_REQUIRED | flags); - - layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, flags); - layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, flags); - - layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags); - layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, flags); - layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, flags); - } else { // Dense layers - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, flags); - } - - // NextN/MTP tensors (preserved but unused) - conditionally load for last nextn_predict_layers - if (hparams.nextn_predict_layers > 0 && static_cast(i) >= n_layer - hparams.nextn_predict_layers) { - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); - layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); - layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED | flags); - layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED | flags); - layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, flags); - } - } - } break; case LLM_ARCH_LLADA2: + case LLM_ARCH_BAILINGMOE2: { const int64_t n_ff_exp = hparams.n_ff_exp; const int64_t n_expert_shared = hparams.n_expert_shared; @@ -5734,8 +5649,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for llada2"); - GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for llada2"); + GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2/llada2"); + GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2/llada2"); for (int i = 0; i < n_layer; ++i) { int flags = 0; From 985ff2924708d4a9a7f28f2b2f445ce12e022efa Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 24 Nov 2025 17:37:47 +0100 Subject: [PATCH 11/43] Cleanup 1 --- examples/diffusion/diffusion-cli.cpp | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 4b55e2a13f..3f979c0653 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ 
b/examples/diffusion/diffusion-cli.cpp @@ -275,14 +275,8 @@ static void diffusion_generate(llama_context * ctx, GGML_ASSERT(params.max_length % params.block_length == 0); num_blocks = params.max_length / params.block_length; - if (params.is_llada2) { - // LLaDA2.0: steps parameter is steps PER block - steps_per_block = params.steps; - } else { - // Dream/LLaDA1.0: steps parameter is TOTAL steps across all blocks - GGML_ASSERT(params.steps % num_blocks == 0); - steps_per_block = params.steps / num_blocks; - } + GGML_ASSERT(params.steps % num_blocks == 0); + steps_per_block = params.steps / num_blocks; } std::vector confidence(params.max_length); @@ -316,19 +310,13 @@ static void diffusion_generate(llama_context * ctx, // Count masked tokens in current block for block-based processing if (params.schedule == BLOCK_BASED) { - if (params.is_llada2) { - // LLaDA2.0: use block_length for scheduling (Python reference behavior) - num_transfer_tokens = get_num_transfer_tokens(params.block_length, steps_per_block); - } else { - // Dream/LLaDA1.0: count actual masked tokens in current block - int32_t block_mask_count = 0; - for (int i = block_start; i < block_end; i++) { - if (output_tokens[i] == params.mask_token_id) { - block_mask_count++; - } + int32_t block_mask_count = 0; + for (int i = block_start; i < block_end; i++) { + if (output_tokens[i] == params.mask_token_id) { + block_mask_count++; } - num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } + num_transfer_tokens = get_num_transfer_tokens(block_mask_count, steps_per_block); } for (int32_t step = 0; step < steps_per_block; step++) { From d3839171ed165da17632a6131f4f142aae62c1e3 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 24 Nov 2025 17:53:40 +0100 Subject: [PATCH 12/43] Cleanup 2 --- examples/diffusion/diffusion-cli.cpp | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 3f979c0653..c81051dae8 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -287,26 +287,10 @@ static void diffusion_generate(llama_context * ctx, bool all_tokens_filled = false; for (int block_num = 0; block_num < num_blocks && !all_tokens_filled; block_num++) { - int32_t block_start, block_end; - - if (params.is_llada2) { - // LLaDA2.0: blocks start from position 0 - block_start = (params.schedule == BLOCK_BASED) ? block_num * params.block_length : 0; - block_end = (params.schedule == BLOCK_BASED) ? - std::min((block_num + 1) * params.block_length, params.max_length) : - params.max_length; - - // Skip blocks fully within the prompt (already processed) - if (block_end <= n_input) { - continue; - } - } else { - // Dream/LLaDA1.0: blocks start after input prompt - block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; - block_end = (params.schedule == BLOCK_BASED) ? - std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : - params.max_length; - } + int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; + int32_t block_end = (params.schedule == BLOCK_BASED) ? 
+ std::min(n_input + (block_num + 1) * params.block_length, params.max_length) : + params.max_length; // Count masked tokens in current block for block-based processing if (params.schedule == BLOCK_BASED) { From 0309fa24a1f8af15c72afc46c315115aff77a5f0 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 24 Nov 2025 21:30:37 +0100 Subject: [PATCH 13/43] Add EOS, Threshold and batch strategy --- gguf-py/gguf/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 049d7f6bd8..319bb6059a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -303,6 +303,9 @@ class Projector: class Diffusion: SHIFT_LOGITS = "diffusion.shift_logits" + BATCH_STRATEGY = "diffusion.batch_strategy" + CHECK_EOS = "diffusion.eos_early_stop" + THRESHOLD = "diffusion.threshold" class xIELU: ALPHA_P = "xielu.alpha_p" From 885ae305862926f0dbb6da02af999bc0524bd326 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Mon, 24 Nov 2025 21:34:23 +0100 Subject: [PATCH 14/43] Add parameters to conversion script Added diffusion parameters for LLaDA2.0 in GGUF writer. --- convert_hf_to_gguf.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f2aa55828c..560bd2e043 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8730,7 +8730,6 @@ def set_gguf_parameters(self): hparams = self.hparams # Override specific parameters for LLaDA2.0 - # Add LLaDA2.0 specific parameters using generic add_key_value method if "max_window_layers" in hparams: self.gguf_writer.add_key_value("llada.max_window_layers", hparams["max_window_layers"], gguf.GGUFValueType.UINT32) @@ -8745,6 +8744,12 @@ def set_gguf_parameters(self): if "pad_token_id" in hparams: self.gguf_writer.add_pad_token_id(hparams["pad_token_id"]) + # Diffusion parameters + self.gguf_writer.add_string(gguf.Keys.Diffusion.BATCH_STRATEGY, "truncate") + self.gguf_writer.add_bool(gguf.Keys.Diffusion.EOS_EARLY_STOP, True) + self.gguf_writer.add_float32(gguf.Keys.Diffusion.THRESHOLD, 0.95) + self.gguf_writer.add_diffusion_shift_logits(False) + def set_vocab(self): # Use the same vocab as BailingMoeV2Model self._set_vocab_gpt2() From 603c86b5884c439149e4a427871e5884220ae523 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Tue, 25 Nov 2025 00:16:45 +0100 Subject: [PATCH 15/43] Cleanup3 --- examples/diffusion/diffusion-cli.cpp | 61 +++++++++++++++++++--------- 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index c81051dae8..9533edb656 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -49,10 +49,12 @@ struct diffusion_params { int32_t block_length = 0; // Block size (for block scheduling) float alg_temp = 0; // algorithm temperature (0.0 = deterministic) bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 - float threshold = 0.95f; // Confidence threshold for transfer + float threshold = -1.0f; // Confidence threshold for transfer (-1.0 = not set, use alg_temp-based sampling) - int32_t max_length = 0; // Maximum sequence length - bool is_llada2 = false; // LLaDA2.0 specific processing + int32_t max_length = 0; // Maximum sequence length + bool is_llada2 = false; // LLaDA2.0 specific processing + bool eos_early_stop = false; // Enable early EOS termination + bool truncate_batch = false; // Truncate batch to block_end (vs full sequence) }; struct callback_data { @@ -316,11 
+318,11 @@ static void diffusion_generate(llama_context * ctx, // Setup batch int32_t batch_size; - if (params.is_llada2) { - // LLaDA2.0: truncate to block_end to avoid attending to future masks + if (params.truncate_batch) { + // Truncate to block_end to avoid attending to future blocks batch_size = block_end; } else { - // Dream/LLaDA1.0: process full sequence + // Process full sequence batch_size = params.max_length; } @@ -468,8 +470,8 @@ static void diffusion_generate(llama_context * ctx, if (transfer_count > 0) { int32_t actual_transfer_count; - if (params.is_llada2) { - // LLaDA2.0: threshold-based confidence approach + if (params.threshold > 0.0f) { + // Threshold-based confidence approach // Sort by confidence (descending) std::partial_sort(confidences.begin(), confidences.end(), @@ -497,7 +499,7 @@ static void diffusion_generate(llama_context * ctx, actual_transfer_count = std::min(actual_transfer_count, (int32_t)confidences.size()); } else { - // Dream/LLaDA1.0: alg_temp-based approach (original implementation) + // alg_temp-based approach (fallback when threshold not set) if (params.alg_temp == 0.0f) { // Deterministic selection: sort and take top transfer_count std::partial_sort(confidences.begin(), @@ -550,8 +552,8 @@ static void diffusion_generate(llama_context * ctx, llama_token transferred_token = sampled_tokens[mask_idx]; output_tokens[pos] = transferred_token; - // EOS early stop (LLaDA2.0 only) - if (params.is_llada2 && transferred_token == eos_token_id) { + // EOS early stop + if (params.eos_early_stop && transferred_token == eos_token_id) { // Verify all tokens from n_input to pos are filled bool all_filled_before_eos = true; for (int32_t j = n_input; j < pos; j++) { @@ -568,7 +570,7 @@ static void diffusion_generate(llama_context * ctx, } } } - if (params.is_llada2 && all_tokens_filled) break; // Exit step loop + if (params.eos_early_stop && all_tokens_filled) break; // Exit step loop } else { LOG_INF("DEBUG: Transfer count is 0!\n"); } @@ -578,8 +580,8 @@ static void diffusion_generate(llama_context * ctx, total_sampling_time += time_end_sampling - time_start_sampling; } - // Check for EOS after block completes (LLaDA2.0 only) - if (params.is_llada2) { + // Check for EOS after block completes + if (params.eos_early_stop) { for (int32_t i = n_input; i < block_end; i++) { if (output_tokens[i] == eos_token_id) { // Check if all tokens before EOS are filled @@ -739,14 +741,33 @@ int main(int argc, char ** argv) { if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); } else { - // Model-dependent default - if (is_llada2) { - diff_params.shift_logits = false; // LLaDA2.0 default: unshifted logits - } else { - diff_params.shift_logits = true; // Dream/LLaDA1.0 default: shifted logits - } + diff_params.shift_logits = true; + } + + // Read EOS early stop parameter from GGUF metadata + char eos_early_stop_str[8]; + if (llama_model_meta_val_str(model, "diffusion.eos_early_stop", eos_early_stop_str, sizeof(eos_early_stop_str)) >= 0) { + diff_params.eos_early_stop = (strcmp(eos_early_stop_str, "true") == 0); + } else { + // Default to false for backward compatibility + diff_params.eos_early_stop = false; } + // Read threshold parameter from GGUF metadata + char threshold_str[32]; + if (llama_model_meta_val_str(model, "diffusion.threshold", threshold_str, sizeof(threshold_str)) >= 0) { + diff_params.threshold = std::stof(threshold_str); + } + // If not 
present, threshold remains at -1.0f (use alg_temp-based sampling) + + // Read batch strategy parameter from GGUF metadata + char batch_strategy_str[32]; + if (llama_model_meta_val_str(model, "diffusion.batch_strategy", batch_strategy_str, sizeof(batch_strategy_str)) >= 0) { + diff_params.truncate_batch = (strcmp(batch_strategy_str, "truncate") == 0); + } else { + // Default to false for backward compatibility + diff_params.truncate_batch = false; + //Use either eps or block length, but not both GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); From 2c2a930714f7251a981d1cab05a2d1b4ebfe493a Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Tue, 25 Nov 2025 00:23:34 +0100 Subject: [PATCH 16/43] Remove LLaDA2.0 specific code and make it model independent --- examples/diffusion/diffusion-cli.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 9533edb656..a507d94c25 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -52,7 +52,6 @@ struct diffusion_params { float threshold = -1.0f; // Confidence threshold for transfer (-1.0 = not set, use alg_temp-based sampling) int32_t max_length = 0; // Maximum sequence length - bool is_llada2 = false; // LLaDA2.0 specific processing bool eos_early_stop = false; // Enable early EOS termination bool truncate_batch = false; // Truncate batch to block_end (vs full sequence) }; @@ -730,13 +729,6 @@ int main(int argc, char ** argv) { struct diffusion_params diff_params; - // Detect if this is LLaDA2.0 model for conditional behavior - bool is_llada2 = false; - char model_arch_str[64]; - if (llama_model_meta_val_str(model, "general.architecture", model_arch_str, sizeof(model_arch_str)) >= 0) { - is_llada2 = (strcmp(model_arch_str, "llada2") == 0); - } - char shift_logits_str[8]; if (llama_model_meta_val_str(model, "diffusion.shift_logits", shift_logits_str, sizeof(shift_logits_str)) >= 0) { diff_params.shift_logits = (strcmp(shift_logits_str, "true") == 0); @@ -767,6 +759,7 @@ int main(int argc, char ** argv) { } else { // Default to false for backward compatibility diff_params.truncate_batch = false; + } //Use either eps or block length, but not both GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); @@ -789,7 +782,6 @@ int main(int argc, char ** argv) { diff_params.top_k = params.sampling.top_k; diff_params.visual_mode = params.diffusion.visual_mode; diff_params.add_gumbel_noise = params.diffusion.add_gumbel_noise; - diff_params.is_llada2 = is_llada2; diff_params.step_callback = diffusion_step_callback; callback_data cb_data = { &diff_params, vocab, n_input }; From 4e5abd294b88f030650087c48f81f0754463f0d1 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Tue, 25 Nov 2025 00:31:09 +0100 Subject: [PATCH 17/43] small fix --- examples/diffusion/diffusion-cli.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index a507d94c25..04dc6ab838 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -306,11 +306,10 @@ static void diffusion_generate(llama_context * ctx, for (int32_t step = 0; step < steps_per_block; step++) { int32_t global_step = block_num * steps_per_block + step; - int32_t total_steps = (params.schedule == BLOCK_BASED) ? 
(num_blocks * steps_per_block) : params.steps; if (params.step_callback) { if (!params.step_callback( - global_step, total_steps, output_tokens, params.max_length, params.step_callback_user_data)) { + global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { break; } } From 758c2f38307a9591314e917e0cd5275c1fbb0b0b Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Tue, 25 Nov 2025 00:32:09 +0100 Subject: [PATCH 18/43] small fix part 2 --- examples/diffusion/diffusion-cli.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 04dc6ab838..c93e94e636 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -306,7 +306,7 @@ static void diffusion_generate(llama_context * ctx, for (int32_t step = 0; step < steps_per_block; step++) { int32_t global_step = block_num * steps_per_block + step; - + if (params.step_callback) { if (!params.step_callback( global_step, params.steps, output_tokens, params.max_length, params.step_callback_user_data)) { From 76e16425dd4a5190a115b77335a28d17ae9c9bc5 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Tue, 25 Nov 2025 18:44:55 +0100 Subject: [PATCH 19/43] small fix 1 --- gguf-py/gguf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 319bb6059a..39d707c79e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -304,7 +304,7 @@ class Projector: class Diffusion: SHIFT_LOGITS = "diffusion.shift_logits" BATCH_STRATEGY = "diffusion.batch_strategy" - CHECK_EOS = "diffusion.eos_early_stop" + EOS_EARLY_STOP = "diffusion.eos_early_stop" THRESHOLD = "diffusion.threshold" class xIELU: From fa087c84d8d85d13e52daf8cbb0c842d197c4462 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 04:44:07 +0100 Subject: [PATCH 20/43] Enable hybrid diffusion --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b47e3053fb..f4d231ad15 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8763,6 +8763,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_string(gguf.Keys.Diffusion.BATCH_STRATEGY, "truncate") self.gguf_writer.add_bool(gguf.Keys.Diffusion.EOS_EARLY_STOP, True) self.gguf_writer.add_float32(gguf.Keys.Diffusion.THRESHOLD, 0.95) + self.gguf_writer.add_bool(gguf.Keys.Diffusion.HYBRID_DIFFUSION, True) self.gguf_writer.add_diffusion_shift_logits(False) def set_vocab(self): From e763d3784bf7322346e48465d80259281b190f61 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 04:45:07 +0100 Subject: [PATCH 21/43] Add HYBRID_DIFFUSION constant to diffusion class --- gguf-py/gguf/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2715313e49..07ee3d18c7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -320,6 +320,7 @@ class Diffusion: BATCH_STRATEGY = "diffusion.batch_strategy" EOS_EARLY_STOP = "diffusion.eos_early_stop" THRESHOLD = "diffusion.threshold" + HYBRID_DIFFUSION = "diffusion.hybrid_diffusion" class xIELU: ALPHA_P = "xielu.alpha_p" From e81ad4d70fb1162f0fef80b7a7b5a91c7873b261 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 04:46:05 +0100 Subject: [PATCH 22/43] Remove LLM_ARCH_LLADA2 from architecture switch Removed LLM_ARCH_LLADA2 case from switch statement. 
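This takes LLaDA2 out of the cache-less diffusion branch in llama_model::create_memory (where res is set to nullptr for the other diffusion architectures), so the architecture allocates a regular KV cache; the hybrid-diffusion changes later in this series appear to rely on that cache (llama_get_memory / llama_memory_seq_rm) to keep the prompt and previously finalized blocks cached while only the active block is re-decoded.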
--- src/llama-model.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 1f6902294e..aa2c7b1923 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -6896,7 +6896,6 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, case LLM_ARCH_GEMMA_EMBEDDING: case LLM_ARCH_DREAM: case LLM_ARCH_LLADA: - case LLM_ARCH_LLADA2: case LLM_ARCH_LLADA_MOE: case LLM_ARCH_RND1: { From ebe9210366ffe9440ab901b5fbc699f855d54694 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 04:52:38 +0100 Subject: [PATCH 23/43] Implement hybrid diffusion optimization Added hybrid diffusion optimization support to the diffusion parameters and processing logic. --- examples/diffusion/diffusion-cli.cpp | 108 +++++++++++++++++++++++---- 1 file changed, 93 insertions(+), 15 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index c93e94e636..604b336e8c 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -51,9 +51,10 @@ struct diffusion_params { bool add_gumbel_noise = false; // Add gumbel noise to the logits if temp > 0.0 float threshold = -1.0f; // Confidence threshold for transfer (-1.0 = not set, use alg_temp-based sampling) - int32_t max_length = 0; // Maximum sequence length - bool eos_early_stop = false; // Enable early EOS termination - bool truncate_batch = false; // Truncate batch to block_end (vs full sequence) + int32_t max_length = 0; // Maximum sequence length + bool eos_early_stop = false; // Enable early EOS termination + bool truncate_batch = false; // Truncate batch to block_end (vs full sequence) + bool hybrid_diffusion = false; // Enable hybrid diffusion optimization with KV cache }; struct callback_data { @@ -287,6 +288,24 @@ static void diffusion_generate(llama_context * ctx, int64_t time_start = ggml_time_us(); bool all_tokens_filled = false; + + // Hybrid Diffusion: Pre-fill prompt if enabled and n_input > 0 + if (params.hybrid_diffusion && n_input > 0) { + // Decode prompt (0..n_input) to KV cache + batch.n_tokens = n_input; + for (int32_t i = 0; i < n_input; i++) { + batch.token[i] = output_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = false; // No logits needed for prompt + } + + if (llama_decode(ctx, batch) != 0) { + LOG_ERR("%s: failed to decode prompt\n", __func__); + return; + } + } for (int block_num = 0; block_num < num_blocks && !all_tokens_filled; block_num++) { int32_t block_start = (params.schedule == BLOCK_BASED) ? n_input + block_num * params.block_length : 0; int32_t block_end = (params.schedule == BLOCK_BASED) ? @@ -316,21 +335,62 @@ static void diffusion_generate(llama_context * ctx, // Setup batch int32_t batch_size; - if (params.truncate_batch) { - // Truncate to block_end to avoid attending to future blocks - batch_size = block_end; + int32_t batch_start_pos; + + // Hybrid Diffusion: Commit previous block to KV cache + if (params.hybrid_diffusion && block_num > 0 && step == 0) { + int32_t prev_block_start = (params.schedule == BLOCK_BASED) ? 
n_input + (block_num - 1) * params.block_length : 0; + int32_t prev_block_end = block_start; + + int32_t pb_size = prev_block_end - prev_block_start; + if (pb_size > 0) { + batch.n_tokens = pb_size; + for (int32_t i = 0; i < pb_size; i++) { + int32_t pos = prev_block_start + i; + batch.token[i] = output_tokens[pos]; + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = 0; + batch.logits[i] = false; + } + + // Remove old KV for this range to ensure we write the fresh finalized tokens + llama_memory_seq_rm(llama_get_memory(ctx), 0, prev_block_start, prev_block_end); + + if (llama_decode(ctx, batch) != 0) { + LOG_ERR("%s: failed to commit previous block %d\n", __func__, block_num - 1); + break; + } + } + } + + if (params.hybrid_diffusion && params.truncate_batch) { + // Hybrid Diffusion: Truncate to active block only + batch_start_pos = block_start; + batch_size = block_end - block_start; + } else if (params.truncate_batch) { + // Legacy: Truncate to block_end + batch_start_pos = 0; + batch_size = block_end; } else { // Process full sequence - batch_size = params.max_length; + batch_start_pos = 0; + batch_size = params.max_length; + } + + // Hybrid Diffusion: Remove old KV for the active region before re-decoding + if (params.hybrid_diffusion) { + llama_memory_seq_rm(llama_get_memory(ctx), 0, batch_start_pos, batch_start_pos + batch_size); } batch.n_tokens = batch_size; for (int32_t i = 0; i < batch_size; i++) { - batch.token[i] = output_tokens[i]; - batch.pos[i] = i; + int32_t pos = batch_start_pos + i; + batch.token[i] = output_tokens[pos]; + batch.pos[i] = pos; batch.n_seq_id[i] = 1; batch.seq_id[i][0] = 0; - batch.logits[i] = 1; + batch.logits[i] = true; } float * logits = nullptr; @@ -350,8 +410,9 @@ static void diffusion_generate(llama_context * ctx, un_x_buffer[i] = params.mask_token_id; } - for (int32_t i = 0; i < params.max_length; i++) { - batch.token[i] = un_x_buffer[i]; + for (int32_t i = 0; i < batch_size; i++) { + int32_t pos = batch_start_pos + i; + batch.token[i] = un_x_buffer[pos]; } ret = llama_decode(ctx, batch); if (ret != 0) { @@ -381,10 +442,17 @@ static void diffusion_generate(llama_context * ctx, } auto get_logits_for_pos = [&](int32_t pos) -> const float * { + // Hybrid Diffusion: Map absolute pos to relative pos in logits + int32_t rel_pos = params.hybrid_diffusion ? (pos - batch_start_pos) : pos; + + if (params.hybrid_diffusion && (pos < batch_start_pos || pos >= batch_start_pos + batch_size)) { + return nullptr; // Position out of active batch range + } + if (params.shift_logits) { - return pos == 0 ? logits : logits + (pos - 1) * n_vocab; + return rel_pos == 0 ? logits : logits + (rel_pos - 1) * n_vocab; } - return logits + (pos) *n_vocab; + return logits + (rel_pos) * n_vocab; }; int64_t time_start_sampling = ggml_time_us(); @@ -758,7 +826,17 @@ int main(int argc, char ** argv) { } else { // Default to false for backward compatibility diff_params.truncate_batch = false; - } + } + + // Read hybrid_diffusion parameter from GGUF metadata + char hybrid_diffusion_str[8]; + if (llama_model_meta_val_str(model, "diffusion.hybrid_diffusion", hybrid_diffusion_str, sizeof(hybrid_diffusion_str)) >= 0) { + diff_params.hybrid_diffusion = (strcmp(hybrid_diffusion_str, "true") == 0); + LOG_INF("Hybrid Diffusion: %s\n", diff_params.hybrid_diffusion ? 
"ENABLED" : "DISABLED"); + } else { + // Default to false for backward compatibility + diff_params.hybrid_diffusion = false; + } //Use either eps or block length, but not both GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0)); From eace3fbe49ae97fdb8f8557996c47be891f3ce76 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 05:17:23 +0100 Subject: [PATCH 24/43] Make model use kv cache --- src/models/llada2.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/models/llada2.cpp b/src/models/llada2.cpp index 8fba3c8acd..4261193c95 100644 --- a/src/models/llada2.cpp +++ b/src/models/llada2.cpp @@ -15,8 +15,7 @@ llm_build_llada2::llm_build_llada2(const llama_model & model, const llm_graph_pa // inp_pos - contains the positions ggml_tensor * inp_pos = build_inp_pos(); - // Non-causal attention for diffusion (from LLaDA) - auto * inp_attn = build_attn_inp_no_cache(); + auto * inp_attn = build_attn_inp_kv(); ggml_tensor * inp_out_ids = build_inp_out_ids(); From e84a77ac399e0fe03719c8a7a5f18bfa6d53b50d Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 05:23:46 +0100 Subject: [PATCH 25/43] Clear white space Co-authored-by: Aman Gupta --- examples/diffusion/diffusion-cli.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 604b336e8c..133122287d 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -276,7 +276,6 @@ static void diffusion_generate(llama_context * ctx, if (params.schedule == BLOCK_BASED) { GGML_ASSERT(params.max_length % params.block_length == 0); num_blocks = params.max_length / params.block_length; - GGML_ASSERT(params.steps % num_blocks == 0); steps_per_block = params.steps / num_blocks; } From 77d833b7020fb7ed343a7b00f5259a7842442ab3 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 05:28:40 +0100 Subject: [PATCH 26/43] revert ubatch --- examples/diffusion/diffusion-cli.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 133122287d..c265944f5c 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -750,8 +750,8 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; - ctx_params.n_batch = std::max(params.n_batch, max_length); // Ensure n_batch >= max_length NOT FINAL ITS JUST FOR TESTING! 
- ctx_params.n_ubatch = std::max(params.n_ubatch, max_length); // Ensure n_ubatch >= max_length + ctx_params.n_batch = params.n_batch; + ctx_params.n_ubatch = params.n_ubatch; ctx_params.flash_attn_type = params.flash_attn_type; ctx_params.no_perf = params.no_perf; ctx_params.type_k = params.cache_type_k; @@ -853,7 +853,7 @@ int main(int argc, char ** argv) { diff_params.temperature = params.sampling.temp; diff_params.steps = params.diffusion.steps; diff_params.algorithm = static_cast(params.diffusion.algorithm); - diff_params.max_length = max_length; + diff_params.max_length = params.n_ubatch; diff_params.top_p = params.sampling.top_p; diff_params.top_k = params.sampling.top_k; diff_params.visual_mode = params.diffusion.visual_mode; From dcc5f1fc22a838c64b2ef9265f8e0a5cfcd610e3 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 05:40:37 +0100 Subject: [PATCH 27/43] Change log level from INFO to DEBUG --- examples/diffusion/diffusion-cli.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index c265944f5c..08841c98bb 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -239,7 +239,7 @@ static void diffusion_generate(llama_context * ctx, // Get EOS token for early termination const llama_vocab * vocab = llama_model_get_vocab(model); llama_token eos_token_id = llama_vocab_eos(vocab); - LOG_INF("DEBUG: EOS token ID = %d\n", eos_token_id); + LOG_DBG("DEBUG: EOS token ID = %d\n", eos_token_id); // Setup sampler chain struct llama_sampler * sampler = llama_sampler_chain_init(llama_sampler_chain_default_params()); @@ -628,7 +628,7 @@ static void diffusion_generate(llama_context * ctx, } } if (all_filled_before_eos) { - LOG_INF("\nEOS detected at position %d, all prior tokens filled. Terminating.\n", pos); + LOG_DBG("\nEOS detected at position %d, all prior tokens filled. Terminating.\n", pos); n_generated = pos + 1 - n_input; all_tokens_filled = true; break; @@ -637,7 +637,7 @@ static void diffusion_generate(llama_context * ctx, } if (params.eos_early_stop && all_tokens_filled) break; // Exit step loop } else { - LOG_INF("DEBUG: Transfer count is 0!\n"); + LOG_DBG("DEBUG: Transfer count is 0!\n"); } } @@ -658,7 +658,7 @@ static void diffusion_generate(llama_context * ctx, } } if (all_filled) { - LOG_INF("\nEOS found at position %d after block %d. Terminating.\n", i, block_num); + LOG_DBG("\nEOS found at position %d after block %d. Terminating.\n", i, block_num); n_generated = i + 1 - n_input; all_tokens_filled = true; break; @@ -744,9 +744,9 @@ int main(int argc, char ** argv) { // Compute max_length early to ensure n_ubatch is large enough int32_t max_length = params.n_predict > 0 ? 
params.n_predict : params.n_ctx; - LOG_INF("DEBUG: params.n_ctx = %d\n", params.n_ctx); - LOG_INF("DEBUG: params.n_predict = %d\n", params.n_predict); - LOG_INF("DEBUG: max_length = %d\n", max_length); + LOG_DBG("DEBUG: params.n_ctx = %d\n", params.n_ctx); + LOG_DBG("DEBUG: params.n_predict = %d\n", params.n_predict); + LOG_DBG("DEBUG: max_length = %d\n", max_length); llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = params.n_ctx; @@ -831,7 +831,7 @@ int main(int argc, char ** argv) { char hybrid_diffusion_str[8]; if (llama_model_meta_val_str(model, "diffusion.hybrid_diffusion", hybrid_diffusion_str, sizeof(hybrid_diffusion_str)) >= 0) { diff_params.hybrid_diffusion = (strcmp(hybrid_diffusion_str, "true") == 0); - LOG_INF("Hybrid Diffusion: %s\n", diff_params.hybrid_diffusion ? "ENABLED" : "DISABLED"); + LOG_DBG("Hybrid Diffusion: %s\n", diff_params.hybrid_diffusion ? "ENABLED" : "DISABLED"); } else { // Default to false for backward compatibility diff_params.hybrid_diffusion = false; From a48f4ea3cdbb1a275fa85c3ba0397aacf291037a Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 11:20:30 +0100 Subject: [PATCH 28/43] Improve confidence handling Refactor confidence calculation and transfer logic for clarity and efficiency. --- examples/diffusion/diffusion-cli.cpp | 53 +++++++++++++++------------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 604b336e8c..9842653665 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -504,6 +504,10 @@ static void diffusion_generate(llama_context * ctx, std::vector> confidences; std::vector sampled_tokens(mask_positions.size()); + int32_t transfer_count = calculate_transfer_count( + step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); + int32_t high_conf_count = 0; + for (size_t i = 0; i < mask_positions.size(); i++) { int32_t pos = mask_positions[i]; const float * pos_logits = get_logits_for_pos(pos); @@ -526,41 +530,42 @@ static void diffusion_generate(llama_context * ctx, float conf = calculate_confidence(cur_p, params.algorithm, rng); + if (params.threshold > 0.0f && conf > params.threshold) { + high_conf_count++; + } + sampled_tokens[i] = sampled_token; confidences.emplace_back(conf, i); } - int32_t transfer_count = calculate_transfer_count( - step, steps_per_block, mask_positions.size(), params.schedule, params.eps, num_transfer_tokens); - if (transfer_count > 0) { int32_t actual_transfer_count; if (params.threshold > 0.0f) { // Threshold-based confidence approach - // Sort by confidence (descending) - std::partial_sort(confidences.begin(), - confidences.end(), - confidences.end(), - [](const std::pair & a, const std::pair & b) { - if (a.first != b.first) { - return a.first > b.first; - } - return a.second < b.second; - }); - - // Count high confidence tokens - int32_t high_conf_count = 0; - float threshold = params.threshold; - for (const auto& item : confidences) { - if (item.first > threshold) { - high_conf_count++; - } - } - - actual_transfer_count = transfer_count; if (high_conf_count >= transfer_count) { + // If we have enough high-confidence tokens, + // use stable_partition to move them to the front, preserving relative order (by position). + // This avoids a full sort. 
+ std::stable_partition(confidences.begin(), + confidences.end(), + [threshold = params.threshold](const std::pair& item) { + return item.first > threshold; + }); actual_transfer_count = high_conf_count; + } else { + // Fallback: Not enough high-confidence tokens to meet the schedule. + // Sort to find the top 'transfer_count' tokens. + std::partial_sort(confidences.begin(), + confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()), + confidences.end(), + [](const std::pair & a, const std::pair & b) { + if (a.first != b.first) { + return a.first > b.first; + } + return a.second < b.second; + }); + actual_transfer_count = transfer_count; } actual_transfer_count = std::min(actual_transfer_count, (int32_t)confidences.size()); From 1c8e5c811ff3d01be9065068db2d8a97b05cc533 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 11:41:59 +0100 Subject: [PATCH 29/43] Implement EOS token assertion for early stop Add assertion for EOS token early stopping. --- examples/diffusion/diffusion-cli.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index b63ecbc4c8..d280cec4b4 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -239,6 +239,11 @@ static void diffusion_generate(llama_context * ctx, // Get EOS token for early termination const llama_vocab * vocab = llama_model_get_vocab(model); llama_token eos_token_id = llama_vocab_eos(vocab); + + if (params.eos_early_stop) { + GGML_ASSERT(eos_token_id != LLAMA_TOKEN_NULL); + } + LOG_DBG("DEBUG: EOS token ID = %d\n", eos_token_id); // Setup sampler chain From 680812d95163ed3be5d5e02b43984928bbfd87be Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:02:42 +0100 Subject: [PATCH 30/43] Update src/models/llada2.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/models/llada2.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/models/llada2.cpp b/src/models/llada2.cpp index 4261193c95..9f51d0c794 100644 --- a/src/models/llada2.cpp +++ b/src/models/llada2.cpp @@ -19,8 +19,7 @@ llm_build_llada2::llm_build_llada2(const llama_model & model, const llm_graph_pa ggml_tensor * inp_out_ids = build_inp_out_ids(); - const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; - for (int il = 0; il < n_transformer_layers; ++il) { + for (int il = 0; il < n_layer; ++il) { ggml_tensor * inpSA = inpL; // norm From 8e372790b69e6d0b5e04e458d9907b8510944ffd Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:02:52 +0100 Subject: [PATCH 31/43] Update src/models/llada2.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/models/llada2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/llada2.cpp b/src/models/llada2.cpp index 9f51d0c794..ec2195cebd 100644 --- a/src/models/llada2.cpp +++ b/src/models/llada2.cpp @@ -59,7 +59,7 @@ llm_build_llada2::llm_build_llada2(const llama_model & model, const llm_graph_pa Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); } - if (il == n_transformer_layers - 1 && inp_out_ids) { + if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } From 0eaaac8b4aa13425b3900b346ddc1756ebac5a8e Mon Sep 17 00:00:00 2001 From: wsbagnsv1 
Date: Wed, 26 Nov 2025 12:30:29 +0100 Subject: [PATCH 32/43] Update convert_hf_to_gguf.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f4d231ad15..e8dfd3f036 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8733,13 +8733,6 @@ def prepare_tensors(self): class LLaDA2MoeModel(BailingMoeV2Model): model_arch = gguf.MODEL_ARCH.LLADA2 - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - # Set block_count directly from num_hidden_layers - self.block_count = self.hparams["num_hidden_layers"] - # Use the same tensor mapping as BailingMoeV2Model since they have identical tensor structure - self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams From 80cb62523310b8bbc3aca13355b1286c60f421b7 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:30:42 +0100 Subject: [PATCH 33/43] Update gguf-py/gguf/constants.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/constants.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 07ee3d18c7..871a0a6bc7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2979,12 +2979,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_SHEXP, MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, - MODEL_TENSOR.NEXTN_EH_PROJ, - MODEL_TENSOR.NEXTN_EMBED_TOKENS, - MODEL_TENSOR.NEXTN_ENORM, - MODEL_TENSOR.NEXTN_HNORM, - MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, - MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, MODEL_TENSOR.LAYER_OUT_NORM, ], MODEL_ARCH.GROVEMOE: [ From 97dcb64a762180dc770cb8f3d00e3f5d547674e7 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:31:18 +0100 Subject: [PATCH 34/43] Update convert_hf_to_gguf.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index e8dfd3f036..7d50a82b67 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8758,10 +8758,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_float32(gguf.Keys.Diffusion.THRESHOLD, 0.95) self.gguf_writer.add_bool(gguf.Keys.Diffusion.HYBRID_DIFFUSION, True) self.gguf_writer.add_diffusion_shift_logits(False) - - def set_vocab(self): - # Use the same vocab as BailingMoeV2Model - self._set_vocab_gpt2() @ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") class GroveMoeModel(TextModel): From baae37ebffef369fb39b7712a9c53dd45d10a100 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:31:33 +0100 Subject: [PATCH 35/43] Update convert_hf_to_gguf.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7d50a82b67..8b71118262 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8735,28 +8735,6 @@ class LLaDA2MoeModel(BailingMoeV2Model): def set_gguf_parameters(self): 
super().set_gguf_parameters() - hparams = self.hparams - - # Override specific parameters for LLaDA2.0 - if "max_window_layers" in hparams: - self.gguf_writer.add_key_value("llada.max_window_layers", hparams["max_window_layers"], gguf.GGUFValueType.UINT32) - - if "output_router_logits" in hparams: - self.gguf_writer.add_key_value("llada.output_router_logits", hparams["output_router_logits"], gguf.GGUFValueType.BOOL) - - # Handle sliding window configuration - if "use_sliding_window" in hparams: - self.gguf_writer.add_key_value("llada.use_sliding_window", hparams["use_sliding_window"], gguf.GGUFValueType.BOOL) - - # Set the correct pad token ID for LLaDA2.0 - if "pad_token_id" in hparams: - self.gguf_writer.add_pad_token_id(hparams["pad_token_id"]) - - # Diffusion parameters - self.gguf_writer.add_string(gguf.Keys.Diffusion.BATCH_STRATEGY, "truncate") - self.gguf_writer.add_bool(gguf.Keys.Diffusion.EOS_EARLY_STOP, True) - self.gguf_writer.add_float32(gguf.Keys.Diffusion.THRESHOLD, 0.95) - self.gguf_writer.add_bool(gguf.Keys.Diffusion.HYBRID_DIFFUSION, True) self.gguf_writer.add_diffusion_shift_logits(False) @ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") From d7f7d1cb12b0ddcdd5d729fee14dc8a01d9d4199 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:33:15 +0100 Subject: [PATCH 36/43] Update src/llama-arch.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-arch.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 2429bd64f9..d9b4d86e61 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2094,12 +2094,6 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, - { LLM_TENSOR_NEXTN_EH_PROJ, "blk.%d.nextn.eh_proj" }, - { LLM_TENSOR_NEXTN_EMBED_TOKENS, "blk.%d.nextn.embed_tokens" }, - { LLM_TENSOR_NEXTN_ENORM, "blk.%d.nextn.enorm" }, - { LLM_TENSOR_NEXTN_HNORM, "blk.%d.nextn.hnorm" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "blk.%d.nextn.shared_head_head" }, - { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, }, }, From a8ba60b362f193a71cc2b3ac69b9a4ea97b50955 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 12:39:42 +0100 Subject: [PATCH 37/43] Update gguf-py/gguf/constants.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/constants.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 871a0a6bc7..7807eabd68 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -317,10 +317,6 @@ class Projector: class Diffusion: SHIFT_LOGITS = "diffusion.shift_logits" - BATCH_STRATEGY = "diffusion.batch_strategy" - EOS_EARLY_STOP = "diffusion.eos_early_stop" - THRESHOLD = "diffusion.threshold" - HYBRID_DIFFUSION = "diffusion.hybrid_diffusion" class xIELU: ALPHA_P = "xielu.alpha_p" From 679de2d08faa62814321fe962d9f3ff137973bba Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:22:24 +0100 Subject: [PATCH 38/43] Refactor EOS and threshold parameters to use CLI --- examples/diffusion/diffusion-cli.cpp | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git 
a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index d280cec4b4..62db05596c 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -812,21 +812,11 @@ int main(int argc, char ** argv) { diff_params.shift_logits = true; } - // Read EOS early stop parameter from GGUF metadata - char eos_early_stop_str[8]; - if (llama_model_meta_val_str(model, "diffusion.eos_early_stop", eos_early_stop_str, sizeof(eos_early_stop_str)) >= 0) { - diff_params.eos_early_stop = (strcmp(eos_early_stop_str, "true") == 0); - } else { - // Default to false for backward compatibility - diff_params.eos_early_stop = false; - } + // EOS early stop parameter from CLI + diff_params.eos_early_stop = params.diffusion.eos_early_stop; - // Read threshold parameter from GGUF metadata - char threshold_str[32]; - if (llama_model_meta_val_str(model, "diffusion.threshold", threshold_str, sizeof(threshold_str)) >= 0) { - diff_params.threshold = std::stof(threshold_str); - } - // If not present, threshold remains at -1.0f (use alg_temp-based sampling) + // Threshold parameter from CLI + diff_params.threshold = params.diffusion.threshold; // Read batch strategy parameter from GGUF metadata char batch_strategy_str[32]; From 11bd5a31d804c5ef585a823882d835861058686b Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:23:02 +0100 Subject: [PATCH 39/43] Add threshold and early stop flags to common.h --- common/common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/common/common.h b/common/common.h index 2f23d0baa8..7979527b01 100644 --- a/common/common.h +++ b/common/common.h @@ -266,6 +266,9 @@ struct common_params_diffusion { float cfg_scale = 0; // classifier-free guidance scale bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0 + + float threshold = -1.0f; // confidence threshold for transfer + bool eos_early_stop = false; // enable early EOS termination }; // reasoning API response format (not to be confused as chat template's reasoning format) From 8cf1588f76114fbf6881df0dc1a3d428697329f5 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:23:52 +0100 Subject: [PATCH 40/43] Add diffusion options for threshold and early stopping --- common/arg.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index dd787290d2..6bde8562e2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2949,6 +2949,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"), [](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-threshold"}, "F", + string_format("confidence threshold for transfer (default: %.2f)", (double) params.diffusion.threshold), + [](common_params & params, const std::string & value) { params.diffusion.threshold = std::stof(value); } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-eos-early-stop"}, + string_format("enable early EOS termination (default: %s)", params.diffusion.eos_early_stop ? 
"true" : "false"), + [](common_params & params) { params.diffusion.eos_early_stop = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "-lr", "--learning-rate" }, "ALPHA", string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0), From 191f1e0d490ceba43de3e886826b9e807de019ad Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:25:12 +0100 Subject: [PATCH 41/43] Add options for hybrid diffusion --- common/arg.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/common/arg.cpp b/common/arg.cpp index 6bde8562e2..78d3bde7a6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2959,6 +2959,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex string_format("enable early EOS termination (default: %s)", params.diffusion.eos_early_stop ? "true" : "false"), [](common_params & params) { params.diffusion.eos_early_stop = true; } ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); + add_opt(common_arg( + {"--diffusion-hybrid"}, + string_format("enable hybrid diffusion optimization (default: %s)", params.diffusion.hybrid_diffusion ? "true" : "false"), + [](common_params & params) { params.diffusion.hybrid_diffusion = true; } + ).set_examples({ LLAMA_EXAMPLE_DIFFUSION })); add_opt(common_arg( { "-lr", "--learning-rate" }, "ALPHA", string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0), From de6416e721fa2de43d2fc28284f777defdea5e53 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:25:59 +0100 Subject: [PATCH 42/43] Add hybrid diffusion optimization flag --- common/common.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/common/common.h b/common/common.h index 7979527b01..718727c3bb 100644 --- a/common/common.h +++ b/common/common.h @@ -269,6 +269,8 @@ struct common_params_diffusion { float threshold = -1.0f; // confidence threshold for transfer bool eos_early_stop = false; // enable early EOS termination + bool hybrid_diffusion = false; // enable hybrid diffusion optimization + }; // reasoning API response format (not to be confused as chat template's reasoning format) From 6896bc30e3c8086c7f0368909981306abeea1431 Mon Sep 17 00:00:00 2001 From: wsbagnsv1 Date: Wed, 26 Nov 2025 13:28:27 +0100 Subject: [PATCH 43/43] Remove truncate_batch and simplify hybrid diffusion --- examples/diffusion/diffusion-cli.cpp | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) diff --git a/examples/diffusion/diffusion-cli.cpp b/examples/diffusion/diffusion-cli.cpp index 62db05596c..28976ecfcc 100644 --- a/examples/diffusion/diffusion-cli.cpp +++ b/examples/diffusion/diffusion-cli.cpp @@ -53,7 +53,6 @@ struct diffusion_params { int32_t max_length = 0; // Maximum sequence length bool eos_early_stop = false; // Enable early EOS termination - bool truncate_batch = false; // Truncate batch to block_end (vs full sequence) bool hybrid_diffusion = false; // Enable hybrid diffusion optimization with KV cache }; @@ -368,14 +367,10 @@ static void diffusion_generate(llama_context * ctx, } } - if (params.hybrid_diffusion && params.truncate_batch) { + if (params.hybrid_diffusion) { // Hybrid Diffusion: Truncate to active block only batch_start_pos = block_start; batch_size = block_end - block_start; - } else if (params.truncate_batch) { - // Legacy: Truncate to block_end - batch_start_pos = 0; - batch_size = block_end; } else { // Process full sequence 
batch_start_pos = 0; @@ -818,24 +813,8 @@ int main(int argc, char ** argv) { // Threshold parameter from CLI diff_params.threshold = params.diffusion.threshold; - // Read batch strategy parameter from GGUF metadata - char batch_strategy_str[32]; - if (llama_model_meta_val_str(model, "diffusion.batch_strategy", batch_strategy_str, sizeof(batch_strategy_str)) >= 0) { - diff_params.truncate_batch = (strcmp(batch_strategy_str, "truncate") == 0); - } else { - // Default to false for backward compatibility - diff_params.truncate_batch = false; - } - - // Read hybrid_diffusion parameter from GGUF metadata - char hybrid_diffusion_str[8]; - if (llama_model_meta_val_str(model, "diffusion.hybrid_diffusion", hybrid_diffusion_str, sizeof(hybrid_diffusion_str)) >= 0) { - diff_params.hybrid_diffusion = (strcmp(hybrid_diffusion_str, "true") == 0); - LOG_DBG("Hybrid Diffusion: %s\n", diff_params.hybrid_diffusion ? "ENABLED" : "DISABLED"); - } else { - // Default to false for backward compatibility - diff_params.hybrid_diffusion = false; - } + // Hybrid diffusion parameter from CLI + diff_params.hybrid_diffusion = params.diffusion.hybrid_diffusion; //Use either eps or block length, but not both GGML_ASSERT((params.diffusion.eps == 0) ^ (params.diffusion.block_length == 0));
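
For reference, the threshold-based transfer selection added in patch 28 can be read in isolation as the sketch below. It is illustrative only and not part of the patch series: the helper name select_transfers_thresholded does not appear in the diffs, the (confidence, position-index) pairs and counts are taken from the surrounding code above, and the alg_temp-based path used when no threshold is set is unchanged and not shown.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

// Returns how many sampled tokens should be transferred this step and moves the selected
// (confidence, position-index) pairs to the front of 'confidences'.
// 'high_conf_count' is the number of entries whose confidence exceeds 'threshold',
// counted while sampling; 'transfer_count' comes from the per-step schedule.
static int32_t select_transfers_thresholded(std::vector<std::pair<float, int32_t>> & confidences,
                                            int32_t transfer_count, float threshold,
                                            int32_t high_conf_count) {
    int32_t actual_transfer_count;
    if (high_conf_count >= transfer_count) {
        // Enough high-confidence tokens: keep all of them, preserving their relative
        // (position) order; a stable partition avoids a full sort.
        std::stable_partition(confidences.begin(), confidences.end(),
                              [threshold](const std::pair<float, int32_t> & c) { return c.first > threshold; });
        actual_transfer_count = high_conf_count;
    } else {
        // Fallback: not enough high-confidence tokens to meet the schedule, so take the
        // top 'transfer_count' by confidence (ties broken by position).
        std::partial_sort(confidences.begin(),
                          confidences.begin() + std::min(transfer_count, (int32_t) confidences.size()),
                          confidences.end(),
                          [](const std::pair<float, int32_t> & a, const std::pair<float, int32_t> & b) {
                              return a.first != b.first ? a.first > b.first : a.second < b.second;
                          });
        actual_transfer_count = transfer_count;
    }
    return std::min(actual_transfer_count, (int32_t) confidences.size());
}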
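
Taken together, patches 23, 26, 27 and 43 converge on a block-wise KV-cache flow: pre-fill the prompt once, re-decode only the active block each step, and commit a block's finalized tokens to the cache before moving on. The following condensed sketch illustrates that flow under stated assumptions; it is not part of the patch series. The helper name decode_range does not appear in the diffs, the llama_batch fields and the llama_get_memory/llama_memory_seq_rm/llama_decode calls are the ones used in the patches above, and sampling, EOS early stop, the step callback and error handling are omitted.

#include "llama.h"

// Sketch of a per-range decode (assumption: 'batch' was created with llama_batch_init()
// large enough for the longest range, and sequence id 0 is the only sequence in use).
static bool decode_range(llama_context * ctx, llama_batch & batch,
                         const llama_token * tokens, int32_t start, int32_t end, bool want_logits) {
    // Drop any stale KV entries for this range, then submit exactly these positions.
    llama_memory_seq_rm(llama_get_memory(ctx), 0, start, end);
    batch.n_tokens = end - start;
    for (int32_t i = 0; i < batch.n_tokens; i++) {
        const int32_t pos  = start + i;
        batch.token[i]     = tokens[pos];
        batch.pos[i]       = pos;
        batch.n_seq_id[i]  = 1;
        batch.seq_id[i][0] = 0;
        batch.logits[i]    = want_logits;
    }
    return llama_decode(ctx, batch) == 0;
}

// Per-generation flow built on top of it:
//   1) pre-fill the prompt once, without logits:
//          decode_range(ctx, batch, tokens, 0, n_input, false);
//   2) for each block [block_start, block_end):
//        - every step: decode_range(ctx, batch, tokens, block_start, block_end, true);
//          then sample the masked positions from the returned logits, which are indexed
//          relative to block_start;
//        - once the block is finalized, commit its tokens to the KV cache:
//          decode_range(ctx, batch, tokens, block_start, block_end, false);
//          (in the patches this commit happens at step 0 of the following block).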