49 commits
ce46999
Implement LLaDA2MoeModel conversion in convert_hf_to_gguf.py
wsbagnsv1 Nov 23, 2025
9716bd4
Add LLADA2 architecture to constants
wsbagnsv1 Nov 23, 2025
a9e81a6
Implement LLaDA2.0 support to diffusion-cli.cpp
wsbagnsv1 Nov 23, 2025
bfc0b31
Add llada2.cpp to CMakeLists.txt
wsbagnsv1 Nov 23, 2025
85f5285
Add LLADA2 architecture support
wsbagnsv1 Nov 23, 2025
d5a4779
Add LLM_ARCH_LLADA2 to architecture list
wsbagnsv1 Nov 23, 2025
3db37fd
Add llada2.0 to llama-model.cpp
wsbagnsv1 Nov 23, 2025
b763f9b
Create llada2.cpp
wsbagnsv1 Nov 23, 2025
07180eb
Add llm_build_llada2 struct to models.h
wsbagnsv1 Nov 23, 2025
b9a938f
Merge branch 'ggml-org:master' into master
wsbagnsv1 Nov 23, 2025
d059973
Merge branch 'ggml-org:master' into master
wsbagnsv1 Nov 24, 2025
e071460
Add proper fall-through for llada2.0
wsbagnsv1 Nov 24, 2025
985ff29
Cleanup 1
wsbagnsv1 Nov 24, 2025
d383917
Cleanup 2
wsbagnsv1 Nov 24, 2025
0309fa2
Add EOS, Threshold and batch strategy
wsbagnsv1 Nov 24, 2025
885ae30
Add parameters to conversion script
wsbagnsv1 Nov 24, 2025
603c86b
Cleanup3
wsbagnsv1 Nov 24, 2025
2c2a930
Remove LLaDA2.0 specific code and make it model independent
wsbagnsv1 Nov 24, 2025
4e5abd2
small fix
wsbagnsv1 Nov 24, 2025
758c2f3
small fix part 2
wsbagnsv1 Nov 24, 2025
76e1642
small fix 1
wsbagnsv1 Nov 25, 2025
66610b7
Merge branch 'master' into master
wsbagnsv1 Nov 25, 2025
fa087c8
Enable hybrid diffusion
wsbagnsv1 Nov 26, 2025
e763d37
Add HYBRID_DIFFUSION constant to diffusion class
wsbagnsv1 Nov 26, 2025
e81ad4d
Remove LLM_ARCH_LLADA2 from architecture switch
wsbagnsv1 Nov 26, 2025
ebe9210
Implement hybrid diffusion optimization
wsbagnsv1 Nov 26, 2025
eace3fb
Make model use kv cache
wsbagnsv1 Nov 26, 2025
c488c41
Add Hybrid diffusion mechanism
wsbagnsv1 Nov 26, 2025
e84a77a
Clear white space
wsbagnsv1 Nov 26, 2025
77d833b
revert ubatch
wsbagnsv1 Nov 26, 2025
dcc5f1f
Change log level from INFO to DEBUG
wsbagnsv1 Nov 26, 2025
a48f4ea
Improve confidence handling
wsbagnsv1 Nov 26, 2025
876fa91
Refactor confidence calculation and transfer logic for clarity and ef…
wsbagnsv1 Nov 26, 2025
1c8e5c8
Implement EOS token assertion for early stop
wsbagnsv1 Nov 26, 2025
d20055f
Merge branch 'ggml-org:master' into master
wsbagnsv1 Nov 26, 2025
680812d
Update src/models/llada2.cpp
wsbagnsv1 Nov 26, 2025
8e37279
Update src/models/llada2.cpp
wsbagnsv1 Nov 26, 2025
0eaaac8
Update convert_hf_to_gguf.py
wsbagnsv1 Nov 26, 2025
80cb625
Update gguf-py/gguf/constants.py
wsbagnsv1 Nov 26, 2025
97dcb64
Update convert_hf_to_gguf.py
wsbagnsv1 Nov 26, 2025
baae37e
Update convert_hf_to_gguf.py
wsbagnsv1 Nov 26, 2025
d7f7d1c
Update src/llama-arch.cpp
wsbagnsv1 Nov 26, 2025
a8ba60b
Update gguf-py/gguf/constants.py
wsbagnsv1 Nov 26, 2025
679de2d
Refactor EOS and threshold parameters to use CLI
wsbagnsv1 Nov 26, 2025
11bd5a3
Add threshold and early stop flags to common.h
wsbagnsv1 Nov 26, 2025
8cf1588
Add diffusion options for threshold and early stopping
wsbagnsv1 Nov 26, 2025
191f1e0
Add options for hybrid diffusion
wsbagnsv1 Nov 26, 2025
de6416e
Add hybrid diffusion optimization flag
wsbagnsv1 Nov 26, 2025
6896bc3
Remove truncate_batch and simplify hybrid diffusion
wsbagnsv1 Nov 26, 2025
15 changes: 15 additions & 0 deletions common/arg.cpp
@@ -2949,6 +2949,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
string_format("add gumbel noise to the logits if temp > 0.0 (default: %s)", params.diffusion.add_gumbel_noise ? "true" : "false"),
[](common_params & params, const std::string & value) { params.diffusion.add_gumbel_noise = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-threshold"}, "F",
string_format("confidence threshold for transfer (default: %.2f)", (double) params.diffusion.threshold),
[](common_params & params, const std::string & value) { params.diffusion.threshold = std::stof(value); }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-eos-early-stop"},
string_format("enable early EOS termination (default: %s)", params.diffusion.eos_early_stop ? "true" : "false"),
[](common_params & params) { params.diffusion.eos_early_stop = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{"--diffusion-hybrid"},
string_format("enable hybrid diffusion optimization (default: %s)", params.diffusion.hybrid_diffusion ? "true" : "false"),
[](common_params & params) { params.diffusion.hybrid_diffusion = true; }
).set_examples({ LLAMA_EXAMPLE_DIFFUSION }));
add_opt(common_arg(
{ "-lr", "--learning-rate" }, "ALPHA",
string_format("adamw or sgd optimizer alpha (default: %.2g); note: sgd alpha recommended ~10x (no momentum)", (double) params.lr.lr0),
5 changes: 5 additions & 0 deletions common/common.h
@@ -266,6 +266,11 @@ struct common_params_diffusion {

float cfg_scale = 0; // classifier-free guidance scale
bool add_gumbel_noise = false; // add gumbel noise to the logits if temp > 0.0

float threshold = -1.0f; // confidence threshold for transfer
bool eos_early_stop = false; // enable early EOS termination
bool hybrid_diffusion = false; // enable hybrid diffusion optimization

};

// reasoning API response format (not to be confused as chat template's reasoning format)
7 changes: 7 additions & 0 deletions convert_hf_to_gguf.py
@@ -8729,6 +8729,13 @@ def prepare_tensors(self):
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")

@ModelBase.register("LLaDA2MoeModelLM")
class LLaDA2MoeModel(BailingMoeV2Model):
model_arch = gguf.MODEL_ARCH.LLADA2

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_diffusion_shift_logits(False)

@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM")
class GroveMoeModel(TextModel):
294 changes: 244 additions & 50 deletions examples/diffusion/diffusion-cli.cpp

Large diffs are not rendered by default.
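Since the examples/diffusion/diffusion-cli.cpp diff is not rendered here, the following is a minimal, self-contained C++ sketch of the kind of per-step logic the new --diffusion-threshold and --diffusion-eos-early-stop flags appear to control, based only on the flag descriptions and commit messages above. The helper names, the mask/EOS handling, and the example invocation (binary name and model file) are assumptions for illustration, not the PR's actual implementation.

// Example invocation (binary name and model path are hypothetical):
//   llama-diffusion-cli -m llada2-mini.gguf -p "..." \
//       --diffusion-threshold 0.9 --diffusion-eos-early-stop --diffusion-hybrid

#include <cstdio>
#include <vector>

struct step_state {
    std::vector<int>   tokens;      // current sequence; mask_id marks unresolved slots
    std::vector<int>   prediction;  // argmax token per position at this step
    std::vector<float> confidence;  // confidence of that argmax prediction
};

// Unmask every masked position whose confidence clears `threshold`
// (threshold < 0 disables the check). Always unmask at least the single most
// confident position so each denoising step makes progress.
static int transfer_by_threshold(step_state & st, int mask_id, float threshold) {
    int   transferred = 0;
    int   best_pos    = -1;
    float best_conf   = -1.0f;

    for (std::size_t i = 0; i < st.tokens.size(); ++i) {
        if (st.tokens[i] != mask_id) {
            continue;
        }
        if (st.confidence[i] > best_conf) {
            best_conf = st.confidence[i];
            best_pos  = (int) i;
        }
        if (threshold >= 0.0f && st.confidence[i] >= threshold) {
            st.tokens[i] = st.prediction[i];
            transferred++;
        }
    }
    if (transferred == 0 && best_pos >= 0) {
        st.tokens[best_pos] = st.prediction[best_pos];
        transferred = 1;
    }
    return transferred;
}

// Early EOS termination: once every position up to and including a committed
// EOS token is resolved, the remaining masked tail can be ignored and the
// outer denoising loop can stop.
static bool eos_reached(const step_state & st, int mask_id, int eos_id) {
    for (std::size_t i = 0; i < st.tokens.size(); ++i) {
        if (st.tokens[i] == mask_id) {
            return false; // an unresolved slot precedes any committed EOS
        }
        if (st.tokens[i] == eos_id) {
            return true;
        }
    }
    return false;
}

int main() {
    const int mask_id = -1;
    const int eos_id  = 2;

    step_state st;
    st.tokens     = { 10, mask_id, mask_id, mask_id };
    st.prediction = { 10, 42, eos_id, 7 };
    st.confidence = { 1.0f, 0.95f, 0.80f, 0.30f };

    transfer_by_threshold(st, mask_id, /*threshold=*/0.9f);
    std::printf("eos reached: %s\n", eos_reached(st, mask_id, eos_id) ? "yes" : "no");
    return 0;
}

In the real diffusion-cli.cpp the per-position confidences would come from the model's logits at each denoising step; this sketch only isolates the transfer and early-stop decisions that the new CLI flags configure.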

25 changes: 25 additions & 0 deletions gguf-py/gguf/constants.py
@@ -436,6 +436,7 @@ class MODEL_ARCH(IntEnum):
SMALLTHINKER = auto()
LLADA = auto()
LLADA_MOE = auto()
LLADA2 = auto()
SEED_OSS = auto()
GROVEMOE = auto()
APERTUS = auto()
@@ -807,6 +808,7 @@ class MODEL_TENSOR(IntEnum):
MODEL_ARCH.SMALLTHINKER: "smallthinker",
MODEL_ARCH.LLADA: "llada",
MODEL_ARCH.LLADA_MOE: "llada-moe",
MODEL_ARCH.LLADA2: "llada2",
MODEL_ARCH.SEED_OSS: "seed_oss",
MODEL_ARCH.GROVEMOE: "grovemoe",
MODEL_ARCH.APERTUS: "apertus",
@@ -2952,6 +2954,29 @@ class MODEL_TENSOR(IntEnum):
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
],
MODEL_ARCH.LLADA2: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
MODEL_TENSOR.OUTPUT,
MODEL_TENSOR.ATTN_NORM,
MODEL_TENSOR.ATTN_Q_NORM,
MODEL_TENSOR.ATTN_K_NORM,
MODEL_TENSOR.ATTN_QKV,
MODEL_TENSOR.ATTN_OUT,
MODEL_TENSOR.FFN_GATE_INP,
MODEL_TENSOR.FFN_EXP_PROBS_B,
MODEL_TENSOR.FFN_NORM,
MODEL_TENSOR.FFN_GATE,
MODEL_TENSOR.FFN_DOWN,
MODEL_TENSOR.FFN_UP,
MODEL_TENSOR.FFN_GATE_EXP,
MODEL_TENSOR.FFN_DOWN_EXP,
MODEL_TENSOR.FFN_UP_EXP,
MODEL_TENSOR.FFN_GATE_SHEXP,
MODEL_TENSOR.FFN_DOWN_SHEXP,
MODEL_TENSOR.FFN_UP_SHEXP,
MODEL_TENSOR.LAYER_OUT_NORM,
],
MODEL_ARCH.GROVEMOE: [
MODEL_TENSOR.TOKEN_EMBD,
MODEL_TENSOR.OUTPUT_NORM,
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -85,6 +85,7 @@ add_library(llama
models/lfm2.cpp
models/llada-moe.cpp
models/llada.cpp
models/llada2.cpp
models/llama-iswa.cpp
models/llama.cpp
models/mamba.cpp
28 changes: 28 additions & 0 deletions src/llama-arch.cpp
@@ -103,6 +103,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_SMALLTHINKER, "smallthinker" },
{ LLM_ARCH_LLADA, "llada" },
{ LLM_ARCH_LLADA_MOE, "llada-moe" },
{ LLM_ARCH_LLADA2, "llada2" },
{ LLM_ARCH_SEED_OSS, "seed_oss" },
{ LLM_ARCH_GROVEMOE, "grovemoe" },
{ LLM_ARCH_APERTUS, "apertus" },
@@ -2070,6 +2071,32 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_LLADA2,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
},
},
{
LLM_ARCH_DOTS1,
{
@@ -2755,6 +2782,7 @@ bool llm_arch_is_diffusion(const llm_arch & arch) {
case LLM_ARCH_DREAM:
case LLM_ARCH_LLADA:
case LLM_ARCH_LLADA_MOE:
case LLM_ARCH_LLADA2:
case LLM_ARCH_RND1:
return true;
default:
1 change: 1 addition & 0 deletions src/llama-arch.h
@@ -107,6 +107,7 @@ enum llm_arch {
LLM_ARCH_SMALLTHINKER,
LLM_ARCH_LLADA,
LLM_ARCH_LLADA_MOE,
LLM_ARCH_LLADA2,
LLM_ARCH_SEED_OSS,
LLM_ARCH_GROVEMOE,
LLM_ARCH_APERTUS,
13 changes: 10 additions & 3 deletions src/llama-model.cpp
@@ -1984,6 +1984,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_LLADA2:
case LLM_ARCH_BAILINGMOE2:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5649,6 +5650,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0);
}
} break;
case LLM_ARCH_LLADA2:
case LLM_ARCH_BAILINGMOE2:
{
const int64_t n_ff_exp = hparams.n_ff_exp;
@@ -5660,8 +5662,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);

GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2");
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2");
GGML_ASSERT(n_expert > 0 && "n_expert must be > 0 for bailingmoe2/llada2");
GGML_ASSERT(n_expert_used > 0 && "n_expert_used must be > 0 for bailingmoe2/llada2");

for (int i = 0; i < n_layer; ++i) {
int flags = 0;
@@ -6755,7 +6757,7 @@ void llama_model::print_info() const {
LLAMA_LOG_INFO("%s: expert_weights_norm = %d\n", __func__, hparams.expert_weights_norm);
}

if (arch == LLM_ARCH_BAILINGMOE2) {
if (arch == LLM_ARCH_BAILINGMOE2 || arch == LLM_ARCH_LLADA2) {
LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead);
LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp);
LLAMA_LOG_INFO("%s: n_ff_shexp = %d\n", __func__, hparams.n_ff_shexp);
@@ -7349,6 +7351,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_bailingmoe2>(*this, params);
} break;
case LLM_ARCH_LLADA2:
{
llm = std::make_unique<llm_build_llada2>(*this, params);
} break;
case LLM_ARCH_SEED_OSS:
{
llm = std::make_unique<llm_build_seed_oss>(*this, params);
@@ -7652,6 +7658,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_COGVLM:
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
case LLM_ARCH_LLADA2:
return LLAMA_ROPE_TYPE_NEOX;

case LLM_ARCH_QWEN2VL:
132 changes: 132 additions & 0 deletions src/models/llada2.cpp
@@ -0,0 +1,132 @@
#include "models.h"

llm_build_llada2::llm_build_llada2(const llama_model & model, const llm_graph_params & params) :
llm_graph_context(params) {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();

GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);

ggml_tensor * cur;
ggml_tensor * inpL;

inpL = build_inp_embd(model.tok_embd);

// inp_pos - contains the positions
ggml_tensor * inp_pos = build_inp_pos();

auto * inp_attn = build_attn_inp_kv();

ggml_tensor * inp_out_ids = build_inp_out_ids();

for (int il = 0; il < n_layer; ++il) {
ggml_tensor * inpSA = inpL;

// norm
cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "attn_norm", il);

// self_attention
{
cur = build_lora_mm(model.layers[il].wqkv, cur);
cb(cur, "wqkv", il);

ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 0 * sizeof(float) * (n_embd));
ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 1 * sizeof(float) * (n_embd));
ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float),
cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa));

Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il);
cb(Qcur, "Qcur_normed", il);

Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);

Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il);
cb(Kcur, "Kcur_normed", il);

Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
cb(Vcur, "Vcur", il);

cur = build_attn(inp_attn,
model.layers[il].wo, model.layers[il].bo,
Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il);
}

if (il == n_layer - 1 && inp_out_ids) {
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
}

ggml_tensor * sa_out = ggml_add(ctx0, cur, inpSA);
cb(sa_out, "sa_out", il);

// MoE branch
cur = build_norm(sa_out, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
cb(cur, "ffn_norm", il);

if (static_cast<uint32_t>(il) < hparams.n_layer_dense_lead) {
cur = build_ffn(cur,
model.layers[il].ffn_up, NULL, NULL,
model.layers[il].ffn_gate, NULL, NULL,
model.layers[il].ffn_down, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(cur, "ffn_out", il);
} else {
ggml_tensor * moe_out = build_moe_ffn(cur,
model.layers[il].ffn_gate_inp,
model.layers[il].ffn_up_exps,
model.layers[il].ffn_gate_exps,
model.layers[il].ffn_down_exps,
model.layers[il].ffn_exp_probs_b,
n_expert, n_expert_used,
LLM_FFN_SILU, hparams.expert_weights_norm,
true, hparams.expert_weights_scale,
(llama_expert_gating_func_type) hparams.expert_gating_func,
il);
cb(moe_out, "ffn_moe_out", il);

{
ggml_tensor * ffn_shexp =
build_ffn(cur,
model.layers[il].ffn_up_shexp, NULL, NULL,
model.layers[il].ffn_gate_shexp, NULL, NULL,
model.layers[il].ffn_down_shexp, NULL, NULL,
NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
cb(ffn_shexp, "ffn_shexp", il);

cur = ggml_add(ctx0, moe_out, ffn_shexp);
cb(cur, "ffn_out", il);
}
}

cur = ggml_add(ctx0, cur, sa_out);

cur = build_cvec(cur, il);
cb(cur, "l_out", il);

// input for next layer
inpL = cur;
}

cur = inpL;

cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);

cb(cur, "result_norm", -1);
res->t_embd = cur;

// lm_head
cur = build_lora_mm(model.output, cur);

cb(cur, "result_output", -1);
res->t_logits = cur;

ggml_build_forward_expand(gf, cur);
}
4 changes: 4 additions & 0 deletions src/models/models.h
@@ -297,6 +297,10 @@ struct llm_build_llada : public llm_graph_context {
llm_build_llada(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_llada2 : public llm_graph_context {
llm_build_llada2(const llama_model & model, const llm_graph_params & params);
};

struct llm_build_llada_moe : public llm_graph_context {
llm_build_llada_moe(const llama_model & model, const llm_graph_params & params);
};