Merged
51 commits
6e2d45a
vulkan: fix memory allocations (llama/17122)
0cc4m Nov 9, 2025
ce8d1da
cuda/vulkan : bicubic interpolation (llama/17022)
Acly Nov 10, 2025
5c64359
arm64: add i8mm route with SVE ggml_vec_dot_q4_K_q8_K and ggml_vec_do…
fj-y-saito Nov 10, 2025
4cd5695
metal : enable tensor API for A19 (llama/17087)
ggerganov Nov 10, 2025
a64712e
vulkan: fix validation issue introduced by #16868 (llama/17145)
0cc4m Nov 10, 2025
d1a83fb
vulkan: check glslc executable string (llama/17144)
0cc4m Nov 10, 2025
e4c1e3c
ggml-cpu : inspect -march and -mcpu to found the CPU (llama/16333)
angt Nov 10, 2025
4413a56
metal : cap threadgroups size of set_rows (llama/17146)
ggerganov Nov 10, 2025
becc46e
cpu: skip NOPs to avoid barriers (llama/17133)
max-krasnyansky Nov 10, 2025
485e423
opencl: add fastdiv and use it in set_rows, ported from cuda (llama/1…
lhez Nov 10, 2025
bee7518
cmake : add version to all shared object files (llama/17091)
furrysalamander Nov 11, 2025
2fe28b6
kleidiai: add optimized per-channel kernels for Q8_0 (llama/16993)
chaxu01 Nov 11, 2025
abbb5f2
ggml-cpu: templateify ggml_compute_forward_rope_f32 and _f16 (llama/1…
duduta Nov 11, 2025
f52e7c7
ggml-cpu : add RISC-V RVV (Zvfh) optimization for FP16 to FP32 conver…
ixgbe Nov 11, 2025
c3a1298
disable rms norm mul rope for chips with no fp16 rte (llama/17134)
netrunnereve Nov 11, 2025
32d1b34
hexagon: various Op fixes (llama/17135)
max-krasnyansky Nov 11, 2025
e9df958
fix ci crash about SSM_CONV (llama/17169)
NeoZhangJianyu Nov 12, 2025
2f2c6c3
CANN: Add L2_NORM op support (llama/16856)
TecJesh Nov 12, 2025
6a2c71b
ggml-cpu: handle 3d tensors in repack mat_mul (llama/17030)
Alcpz Nov 12, 2025
a541b0e
ggml : use std::sort in ggml_argsort CPU implementation (llama/17211)
ggerganov Nov 12, 2025
214d1af
CUDA: static assert to prevent misuse of memcpy_1 (llama/17198)
JohannesGaessler Nov 12, 2025
be4d130
CUDA: fuse rope + set_rows (llama/16884)
am17an Nov 13, 2025
c880b43
CANN: Add cross_entropy_loss op support (llama/16886)
TecJesh Nov 13, 2025
9808706
ggml-cpu : use template for argsort (llama/17222)
slaren Nov 13, 2025
b6d0ebe
Revert "ggml-cpu: handle 3d tensors in repack mat_mul (llama/17030)" …
ggerganov Nov 13, 2025
5150c23
metal: accelerated conv2d (llama/17175)
bghira Nov 13, 2025
273dd3f
ggml-cpu : add RISC-V vector intrinsic support for silu and cvar oper…
ixgbe Nov 13, 2025
312480c
sched : fix reserve ignoring user tensor assignments (llama/17232)
slaren Nov 13, 2025
1b4c6ad
vulkan: remove shell call from vulkan-shaders-gen tool, revert file c…
0cc4m Nov 13, 2025
e9b37f5
ggml : add ops SOFTPLUS, EXPM1, TRI, SOLVE_TRI, CUMSUM (llama/17063)
pwilkin Nov 13, 2025
3e4ae29
ggml-cpu: handle 3d tensors in repack mat_mul (llama/17241)
Alcpz Nov 13, 2025
ae08083
metal : make the FA extra sizes consistent (llama/17143)
ggerganov Nov 14, 2025
a6f1d80
metal : support argsort for ne00 > 1024 (llama/17247)
ggerganov Nov 14, 2025
786e005
vulkan: change graph_compute to be async and enable get_tensor_async …
jeffbolznv Nov 15, 2025
9d3fa94
vulkan: skip all-negative-inf blocks in FA (llama/17186)
jeffbolznv Nov 15, 2025
a175f85
vulkan: Use ggml_vk_tensor_subbuffer in mul_mat_vec(id) paths (llama/…
jeffbolznv Nov 15, 2025
89f82bf
vulkan: implement ABS and NEG (llama/17245)
giuseppe Nov 15, 2025
5ae4173
vulkan: Replace 16-bit unpack8 calls to work around legacy Windows AM…
0cc4m Nov 15, 2025
5d9fba0
vulkan: Fuse mul_mat_id+add_id+mul and mul_mat+add+add. (llama/17287)
jeffbolznv Nov 15, 2025
d735614
sycl : unify unary kernels with a generic implementation and enable w…
shani-f Nov 15, 2025
14dac59
opencl: add kernel to handle mat mul in attention to improve encoding…
shaofeiqi Nov 16, 2025
9c2bde0
opencl: fix rms_norm_mul (llama/17250)
lhez Nov 16, 2025
844275a
metal : remove obosolete asserts (llama/17295)
ggerganov Nov 16, 2025
75cfe4a
vulkan: fix MMQ quantize_y condition (llama/17301)
0cc4m Nov 16, 2025
4f694e4
vulkan: add LOG operation support for F32 and F16 (llama/17183)
zayac Nov 16, 2025
7e09095
CANN: Use smart pointers to manage ACL objects (llama/17238)
hipudding Nov 17, 2025
25182a7
metal : add cumsum (llama/17305)
ggerganov Nov 17, 2025
8208359
metal : faster argsort (llama/17315)
ggerganov Nov 17, 2025
714c1ba
metal : support I32 -> I32 copy (llama/17317)
ggerganov Nov 17, 2025
36b80f6
sync : ggml
ggerganov Nov 17, 2025
3e980fd
sync : llama.cpp
ggerganov Nov 17, 2025
16 changes: 16 additions & 0 deletions cmake/arm64-apple-clang.cmake
@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-apple-darwin-macho )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
16 changes: 16 additions & 0 deletions cmake/arm64-windows-llvm.cmake
@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
29 changes: 29 additions & 0 deletions cmake/riscv64-spacemit-linux-gnu-gcc.cmake
@@ -0,0 +1,29 @@
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv64)
set(CMAKE_SYSTEM_VERSION 1)

if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "^(riscv)")
message(STATUS "HOST SYSTEM ${CMAKE_HOST_SYSTEM_PROCESSOR}")
else()
set(GNU_MACHINE riscv64-unknown-linux-gnu CACHE STRING "GNU compiler triple")
if (DEFINED ENV{RISCV_ROOT_PATH})
file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
else()
message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
endif()

set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv toolchain")
set(CMAKE_C_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-gcc)
set(CMAKE_CXX_COMPILER ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-g++)
set(CMAKE_STRIP ${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-strip)
set(CMAKE_FIND_ROOT_PATH "${RISCV_ROOT_PATH}/riscv64-unknown-linux-gnu")
set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
endif()

set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY)
set(CMAKE_C_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CMAKE_C_FLAGS}")
set(CMAKE_CXX_FLAGS "-march=rv64gcv_zfh_zba_zicbop -mabi=lp64d ${CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -latomic")
5 changes: 5 additions & 0 deletions cmake/x64-windows-llvm.cmake
@@ -0,0 +1,5 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
32 changes: 32 additions & 0 deletions examples/talk-llama/llama-arch.cpp
@@ -90,6 +90,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
{ LLM_ARCH_BAILINGMOE2, "bailingmoe2" },
{ LLM_ARCH_DOTS1, "dots1" },
{ LLM_ARCH_ARCEE, "arcee" },
{ LLM_ARCH_AFMOE, "afmoe" },
{ LLM_ARCH_ERNIE4_5, "ernie4_5" },
{ LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
{ LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
@@ -333,6 +334,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
},
},
{
LLM_ARCH_AFMOE,
{
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
{ LLM_TENSOR_OUTPUT, "output" },
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
{ LLM_TENSOR_ATTN_GATE, "blk.%d.attn_gate" },
{ LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
{ LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
{ LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
{ LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
{ LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
{ LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
{ LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
{ LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
{ LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
},
},
{
LLM_ARCH_LLAMA4,
{
@@ -2444,6 +2475,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
{LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_ATTN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
{LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
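Note (illustration, not part of the diff): the new LLM_ARCH_AFMOE entries above are printf-style templates that are expanded with a layer index when tensors are looked up. A minimal standalone C++ sketch of that expansion; the format_tensor_name helper and the sample layer indices are made up for the example.

#include <cstdio>
#include <string>

// Hypothetical helper: expands a printf-style tensor-name template for one layer.
static std::string format_tensor_name(const char * templ, int layer) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), templ, layer);
    return buf;
}

int main() {
    // "blk.%d.attn_gate" -> "blk.5.attn_gate"
    std::printf("%s\n", format_tensor_name("blk.%d.attn_gate", 5).c_str());
    // "blk.%d.ffn_gate_exps" -> "blk.12.ffn_gate_exps"
    std::printf("%s\n", format_tensor_name("blk.%d.ffn_gate_exps", 12).c_str());
    return 0;
}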
2 changes: 2 additions & 0 deletions examples/talk-llama/llama-arch.h
@@ -94,6 +94,7 @@ enum llm_arch {
LLM_ARCH_BAILINGMOE2,
LLM_ARCH_DOTS1,
LLM_ARCH_ARCEE,
LLM_ARCH_AFMOE,
LLM_ARCH_ERNIE4_5,
LLM_ARCH_ERNIE4_5_MOE,
LLM_ARCH_HUNYUAN_MOE,
@@ -312,6 +313,7 @@ enum llm_tensor {
LLM_TENSOR_ATTN_POST_NORM,
LLM_TENSOR_ATTN_ROT_EMBD,
LLM_TENSOR_ATTN_SINKS,
LLM_TENSOR_ATTN_GATE,
LLM_TENSOR_FFN_GATE_INP,
LLM_TENSOR_FFN_GATE_INP_SHEXP,
LLM_TENSOR_FFN_NORM,
3 changes: 2 additions & 1 deletion examples/talk-llama/llama-graph.cpp
@@ -1592,9 +1592,10 @@ ggml_tensor * llm_graph_context::build_attn(
int il) const {
// these nodes are added to the graph together so that they are not reordered
// by doing so, the number of splits in the graph is reduced
// expand k later to enable rope fusion which directly writes into k-v cache
ggml_build_forward_expand(gf, q_cur);
ggml_build_forward_expand(gf, k_cur);
ggml_build_forward_expand(gf, v_cur);
ggml_build_forward_expand(gf, k_cur);

const auto * mctx_cur = inp->mctx;

7 changes: 4 additions & 3 deletions examples/talk-llama/llama-memory-recurrent.cpp
@@ -151,7 +151,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
p1 = std::numeric_limits<llama_pos>::max();
}

// models like Mamba or RWKV can't have a state partially erased
// models like Mamba or RWKV can't have a state partially erased at the end
// of the sequence because their state isn't preserved for previous tokens
if (seq_id >= (int64_t) size) {
// could be fatal
return false;
@@ -160,8 +161,8 @@ bool llama_memory_recurrent::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos
int32_t & tail_id = cells[seq_id].tail;
if (tail_id >= 0) {
const auto & cell = cells[tail_id];
// partial intersection is invalid
if ((0 < p0 && p0 < cell.pos) || (0 < p1 && p1 <= cell.pos)) {
// partial intersection is invalid if it includes the final pos
if (0 < p0 && p0 <= cell.pos && p1 > cell.pos) {
//printf("[DEBUG] inside `llama_memory_recurrent::seq_rm`: partial intersection is invalid, so returning false\n");
return false;
}
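Note (illustration, not part of the diff): a minimal standalone sketch of the revised check above, assuming the predicate is exactly the one shown in the diff; is_invalid_partial_erase and the sample positions are made up for the example.

#include <cassert>
#include <cstdint>

using llama_pos = int32_t;

// Mirrors the revised condition: a removal [p0, p1) is rejected only when it
// starts mid-sequence (p0 > 0) and still covers the cell's final position,
// because the recurrent state for earlier tokens cannot be recovered.
static bool is_invalid_partial_erase(llama_pos cell_pos, llama_pos p0, llama_pos p1) {
    return 0 < p0 && p0 <= cell_pos && p1 > cell_pos;
}

int main() {
    const llama_pos cell_pos = 10; // last stored position of the sequence
    assert(!is_invalid_partial_erase(cell_pos, 0, 100)); // full erase from the start: accepted
    assert( is_invalid_partial_erase(cell_pos, 5, 100)); // cuts off the tail mid-sequence: rejected
    assert(!is_invalid_partial_erase(cell_pos, 3,   8)); // does not include the final pos: passes this check
    return 0;
}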
102 changes: 102 additions & 0 deletions examples/talk-llama/llama-model.cpp
@@ -84,6 +84,7 @@ const char * llm_type_name(llm_type type) {
case LLM_TYPE_15B: return "15B";
case LLM_TYPE_16B: return "16B";
case LLM_TYPE_20B: return "20B";
case LLM_TYPE_26B: return "26B";
case LLM_TYPE_27B: return "27B";
case LLM_TYPE_30B: return "30B";
case LLM_TYPE_32B: return "32B";
@@ -695,6 +696,37 @@ void llama_model::load_hparams(llama_model_loader & ml) {
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_AFMOE:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false);
ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false);
ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false);

// Set up interleaved sliding window attention (ISWA)
// Pattern: 3 sliding - 1 full (global_attn_every_n_layers = 4)
if (hparams.n_swa > 0) {
hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
hparams.set_swa_pattern(4);
} else {
hparams.swa_type = LLAMA_SWA_TYPE_NONE;
}

// Default to sigmoid if not set
if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
}

switch (hparams.n_layer) {
case 56: type = LLM_TYPE_6B; break;
case 32: type = LLM_TYPE_26B; break;
default: type = LLM_TYPE_UNKNOWN;
}
} break;
case LLM_ARCH_DECI:
{
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -5749,6 +5781,71 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
} break;
case LLM_ARCH_AFMOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);

// output
output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);

// if output is NULL, init from the input tok embed
if (output == NULL) {
output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
}

const int64_t n_ff_exp = hparams.n_ff_exp;
const int64_t n_expert_shared = hparams.n_expert_shared;

for (int i = 0; i < n_layer; ++i) {
auto & layer = layers[i];

// dual attention normalization
layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);

// attention projections
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0);
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0);
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0);

// Q/K normalization
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0);
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0);

// attention gating
layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_embd_head_k * n_head}, 0);

// dual ffn normalization
layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);

if (static_cast<uint32_t>(i) >= hparams.n_layer_dense_lead) {
// MoE layers
layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0);
layer.ffn_exp_probs_b = create_tensor(tn(LLM_TENSOR_FFN_EXP_PROBS_B, "bias", i), {n_expert}, 0);

// grouped expert weights
layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);
layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0);
layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff_exp, n_expert}, 0);

// shared expert
if (n_expert_shared > 0) {
const int64_t n_ff_shexp = n_ff_exp * n_expert_shared;
layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {n_ff_shexp, n_embd}, 0);
layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_shexp}, 0);
}
} else {
// Dense layers
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
}
}
} break;
case LLM_ARCH_ERNIE4_5:
case LLM_ARCH_ERNIE4_5_MOE:
{
@@ -7243,6 +7340,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
{
llm = std::make_unique<llm_build_arcee>(*this, params);
} break;
case LLM_ARCH_AFMOE:
{
llm = std::make_unique<llm_build_afmoe>(*this, params);
} break;
case LLM_ARCH_ERNIE4_5:
{
llm = std::make_unique<llm_build_ernie4_5>(*this, params);
@@ -7528,6 +7629,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
case LLM_ARCH_MINIMAX_M2:
case LLM_ARCH_COGVLM:
case LLM_ARCH_PANGU_EMBED:
case LLM_ARCH_AFMOE:
return LLAMA_ROPE_TYPE_NEOX;

case LLM_ARCH_QWEN2VL:
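Note (illustration, not part of the diff): the AFMOE ISWA setup in load_hparams above only states the intent ("3 sliding - 1 full", period 4). A standalone sketch of one plausible layer layout under that assumption; the modulo placement is an assumption for illustration, not the actual set_swa_pattern implementation.

#include <cstdio>

int main() {
    const int n_layer   = 12;
    const int n_pattern = 4; // assumed period: every 4th layer uses full (global) attention

    for (int il = 0; il < n_layer; ++il) {
        // assumption: the last layer of each group of 4 gets full attention
        const bool is_swa = (il % n_pattern) < (n_pattern - 1);
        std::printf("layer %2d: %s\n", il, is_swa ? "sliding-window" : "full attention");
    }
    return 0;
}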
2 changes: 2 additions & 0 deletions examples/talk-llama/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
LLM_TYPE_15B,
LLM_TYPE_16B,
LLM_TYPE_20B,
LLM_TYPE_26B,
LLM_TYPE_27B,
LLM_TYPE_30B,
LLM_TYPE_32B,
@@ -234,6 +235,7 @@ struct llama_layer {
struct ggml_tensor * wk_enc = nullptr;
struct ggml_tensor * wv_enc = nullptr;
struct ggml_tensor * wo_enc = nullptr;
struct ggml_tensor * wqkv_gate = nullptr;

// attention bias
struct ggml_tensor * bq = nullptr;
15 changes: 10 additions & 5 deletions examples/talk-llama/llama-sampling.cpp
@@ -4,6 +4,7 @@
#include "llama-vocab.h"
#include "llama-grammar.h"

#include <array>
#include <algorithm>
#include <cassert>
#include <cfloat>
@@ -1625,10 +1626,12 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
auto * ctx = new llama_sampler_grammar;

if (grammar_str != nullptr && grammar_str[0] != '\0') {
std::string trigger_pattern;
llama_grammar * grammar = nullptr;
// TODO: remove trigger_words support.
if (trigger_words != nullptr && num_trigger_words > 0) {
GGML_ASSERT(trigger_patterns == nullptr && num_trigger_patterns == 0);
std::string trigger_pattern("[\\s\\S]*?(");
trigger_pattern = "[\\s\\S]*?(";
for (size_t i = 0; i < num_trigger_words; ++i) {
static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
if (i > 0) {
@@ -1637,15 +1640,17 @@ static struct llama_sampler * llama_sampler_init_grammar_impl(
trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
}
trigger_pattern += ")[\\s\\S]*";
const auto * trigger_pattern_c = trigger_pattern.c_str();
trigger_patterns = &trigger_pattern_c;
num_trigger_patterns = 1;

std::array<const char *, 1> tmp_trigger_patterns = { trigger_pattern.c_str() };
grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, tmp_trigger_patterns.data(), tmp_trigger_patterns.size(), trigger_tokens, num_trigger_tokens);
} else {
grammar = llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens);
}
*ctx = {
/* .vocab = */ vocab,
/* .grammar_str = */ grammar_str,
/* .grammar_root = */ grammar_root,
/* .grammar = */ llama_grammar_init_impl(vocab, grammar_str, grammar_root, lazy, trigger_patterns, num_trigger_patterns, trigger_tokens, num_trigger_tokens),
/* .grammar = */ grammar,
};
if (!ctx->grammar) {
delete ctx;
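Note (illustration, not part of the diff): a standalone sketch of how the deprecated trigger_words path above composes a single trigger pattern. The escaping regex and the wrapping strings are copied from the diff; the sample words are made up.

#include <cstdio>
#include <regex>
#include <string>

int main() {
    const char * trigger_words[] = { "<tool_call>", "foo(bar)" };
    const size_t num_trigger_words = 2;

    // words are regex-escaped, joined with '|', and wrapped so the pattern
    // matches any text that contains one of them
    std::string trigger_pattern = "[\\s\\S]*?(";
    for (size_t i = 0; i < num_trigger_words; ++i) {
        static const std::regex special_chars("[.^$|()*+?\\[\\]{}\\\\]");
        if (i > 0) {
            trigger_pattern += "|";
        }
        trigger_pattern += std::regex_replace(trigger_words[i], special_chars, "\\$0");
    }
    trigger_pattern += ")[\\s\\S]*";

    std::printf("%s\n", trigger_pattern.c_str());
    return 0;
}

The std::array in the diff serves a related purpose: it keeps the composed pattern's c_str() alive until llama_grammar_init_impl is called within the same branch, instead of pointing trigger_patterns at a string that goes out of scope.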