From 61cfad02078894d84601207c4739535e83c32c2b Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Tue, 8 Jul 2025 16:35:49 -0400
Subject: [PATCH 01/32] support for smoldocling

---
 convert_hf_to_gguf_update.py   |  1 +
 gguf-py/gguf/tensor_mapping.py | 17 ++++++++++++++++-
 include/llama.h                |  1 +
 src/llama-model.cpp            |  2 ++
 src/llama-model.h              |  1 +
 src/llama-vocab.cpp            | 14 ++++++++++++++
 6 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 2f733f0973686..1c3748297c382 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index b30f77dbe3be7..c769ea664b036 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -32,6 +32,7 @@ class TensorNameMap:
             "model.word_embeddings",                # bailingmoe
             "language_model.model.embed_tokens",    # llama4
             "encoder",                              # neobert
+            "model.text_model.embed_tokens.weight", # smoldocling
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                # gptneox
             "lm_head",                  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
-            "output",                   # llama-pth bloom internlm2
+            "output",                   # llama-pth bloom internlm2 smoldocling
             "word_embeddings_for_head", # persimmon
             "lm_head.linear",           # phi2
             "output_layer",             # chatglm
@@ -93,6 +94,7 @@ class TensorNameMap:
             "model.ln_out",              # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
             "model.norm",                # llama4
+            "output_norm",               # smoldocling
         ),

         # Rope frequencies
@@ -136,6 +138,7 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",                   # rwkv7
             "model.layers.{bid}.input_layernorm",       # llama4
             "transformer_encoder.{bid}.attention_norm", # neobert
+            "blk.{bid}.attn_norm",                      # smoldocling
         ),

         # Attention norm 2
@@ -179,6 +182,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj",                  # exaone
             "model.layers.{bid}.self_attn.q_proj",                        # llama4
+            "blk.{bid}.attn_q",                                           # smoldocling
         ),

         # Attention key
@@ -195,6 +199,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj",                # exaone
             "model.layers.{bid}.self_attn.k_proj",                      # llama4
+            "blk.{bid}.attn_k",                                         # smoldocling
         ),

         # Attention value
@@ -210,6 +215,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj",                  # exaone
             "model.layers.{bid}.self_attn.v_proj",                        # llama4
+            "blk.{bid}.attn_v",                                           # smoldocling
+
         ),

         # Attention output
@@ -240,6 +247,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
             "model.layers.{bid}.self_attn.o_proj",         # llama4
             "transformer_encoder.{bid}.wo",                # neobert
+            "blk.{bid}.attn_output",                       # smoldocling
         ),

         # Attention output norm
@@ -249,6 +257,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",                     # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",     # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
+            "blk.{bid}.attn_norm",                            # smoldocling
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
@@ -281,6 +290,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn_norm",           # openelm
             "model.layers.{bid}.post_attention_layernorm", # llama4
             "transformer_encoder.{bid}.ffn_norm",          # neobert
+            "blk.{bid}.ffn_norm",                          # smoldocling
         ),

         # Post feed-forward norm
@@ -346,6 +356,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone
             "model.layers.{bid}.feed_forward.up_proj", # llama4
             "transformer_encoder.{bid}.ffn.w12",       # neobert
+            "blk.{bid}.ffn_up",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -383,6 +394,8 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",            # exaone
             "model.layers.{bid}.feed_forward.gate_proj", # llama4
+            "blk.{bid}.ffn_gate",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -429,6 +442,8 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",           # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
             "transformer_encoder.{bid}.ffn.w3",          # neobert
+            "blk.{bid}.ffn_down",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
diff --git a/include/llama.h b/include/llama.h
index 3eda9bc68608c..d920732622b81 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -117,6 +117,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
+        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING    = 36,
     };

     enum llama_rope_type {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index fc39195ed5177..a2aa7856a169e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_190M:          return "190M";
         case LLM_TYPE_220M:          return "220M";
         case LLM_TYPE_250M:          return "250M";
+        case LLM_TYPE_256M:          return "256M";
         case LLM_TYPE_270M:          return "270M";
         case LLM_TYPE_335M:          return "335M";
         case LLM_TYPE_410M:          return "410M";
@@ -575,6 +576,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 22: type = LLM_TYPE_1B; break;
                     case 26: type = LLM_TYPE_3B; break;
                     case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                    case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                     // granite uses a vocab with len 49152
                     case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                     case 36: type = LLM_TYPE_8B; break; // granite
diff --git a/src/llama-model.h b/src/llama-model.h
index 40063b790d434..9258e7cf65f2b 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -32,6 +32,7 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M, // smoldocling
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_410M,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 5c9eb87566dde..995e87da607ce 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -424,6 +424,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
+                // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
+                // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
+                regex_exprs = {
+                    "[0-9]",
+                    "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1656,6 +1663,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "smoldocling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -1839,6 +1850,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<EOT>"
                         || t.first == "_<EOT>"
                         || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                        || t.first == "<end_of_utterance>"  // smoldocling
                     ) {
                     special_eot_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -1998,6 +2010,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
                    || t.first == "<end_of_utterance>" // smoldocling
                    ) {
                special_eog_ids.insert(t.second);
                if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
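The two regular expressions registered for LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING above are applied in sequence: the first isolates every single digit, the second splits what remains into runs of word characters and runs of other printable characters. As a rough illustration (a Python sketch only — the real tokenizer runs these through ggml's unicode regex splitter, whose whitespace handling differs from Python's re module):

    import re

    def pre_tokenize(text: str) -> list[str]:
        # pass 1: "[0-9]" -- isolate single digits as their own pieces
        pieces = [p for p in re.split(r"([0-9])", text) if p]
        # pass 2: word-character runs vs. runs of other non-space characters
        out: list[str] = []
        for piece in pieces:
            out.extend(re.findall(r"[a-zA-Z0-9_]+|[^a-zA-Z0-9_\s]+", piece))
        return out

    print(pre_tokenize("table 42, row_3!"))
    # ['table', '4', '2', ',', 'row_', '3', '!']

Splitting digits one at a time keeps numbers from merging into large, rarely-seen tokens, which is presumably useful for a document-conversion model that emits many coordinates and indices.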
From 8c18417af549c1eba39139214b1c901e4e701a63 Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Tue, 8 Jul 2025 16:47:52 -0400
Subject: [PATCH 02/32] fixed merge conflicts

---
 include/llama.h     | 3 ++-
 src/llama-vocab.cpp | 5 +++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/llama.h b/include/llama.h
index d920732622b81..045453e0e4107 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -117,7 +117,8 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_LLAMA4         = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
-        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING    = 36,
+        LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING    = 37,
     };

     enum llama_rope_type {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 995e87da607ce..dc247383fd87c 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             break;
         case LLAMA_VOCAB_PRE_TYPE_STABLELM2:
         case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+        case LLAMA_VOCAB_PRE_TYPE_HUNYUAN:
            regex_exprs = {
                // original regex from tokenizer.json
                // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
@@ -1663,6 +1664,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "seed-coder") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "hunyuan") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+                clean_spaces = false;
             } else if (
                 tokenizer_pre == "smoldocling") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;

From c36703a2a3937c4a13bad30f37dcc437bddf5de3 Mon Sep 17 00:00:00 2001
From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com>
Date: Wed, 9 Jul 2025 12:10:41 -0400
Subject: [PATCH 03/32] Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Gabe Goodhart
---
 gguf-py/gguf/tensor_mapping.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 88849b7ed6d26..0a2473ef6d045 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -399,8 +399,7 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",            # exaone
             "model.layers.{bid}.feed_forward.gate_proj", # llama4
-            "blk.{bid}.ffn_gate",                        # smoldocling
-
+            "blk.{bid}.ffn_gate",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (

From 6ea3b0bf78c7f4c4afd9d3bb549f5d048b1a8223 Mon Sep 17 00:00:00 2001
From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com>
Date: Wed, 9 Jul 2025 12:10:47 -0400
Subject: [PATCH 04/32] Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Gabe Goodhart
---
 gguf-py/gguf/tensor_mapping.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 0a2473ef6d045..7d0a49104f41b 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -447,8 +447,7 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",           # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
             "transformer_encoder.{bid}.ffn.w3",          # neobert
-            "blk.{bid}.ffn_down",                        # smoldocling
-
+            "blk.{bid}.ffn_down",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
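For context on what these tensor_mapping.py entries do: TensorNameMap associates each GGUF tensor with the checkpoint names it may carry, with "{bid}" standing for the block index, and the ".weight"/".bias" suffixes are handled separately (which is why map entries normally omit them). A simplified, hypothetical sketch of the lookup — not the actual gguf-py implementation:

    mapping = {
        "token_embd":       ("model.embed_tokens", "model.text_model.embed_tokens"),
        "blk.{bid}.attn_q": ("model.layers.{bid}.self_attn.q_proj",),
    }

    def to_gguf_name(hf_name: str, n_blocks: int) -> str | None:
        # strip the parameter suffix, then try every candidate per block id
        base = hf_name.removesuffix(".weight").removesuffix(".bias")
        for gguf_name, candidates in mapping.items():
            for cand in candidates:
                for bid in range(n_blocks):
                    if base == cand.format(bid=bid):
                        return gguf_name.format(bid=bid)
        return None

    print(to_gguf_name("model.layers.3.self_attn.q_proj.weight", 30))  # blk.3.attn_q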
From 7e16cb8ad3035e3a24c1fde6bf223d935f7211ac Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Wed, 9 Jul 2025 12:27:23 -0400
Subject: [PATCH 05/32] merge conflicts

---
 convert_hf_to_gguf_update.py   |  1 +
 gguf-py/gguf/tensor_mapping.py | 17 ++++++++++++++++-
 include/llama.h                |  1 +
 src/llama-model.cpp            |  2 ++
 src/llama-model.h              |  1 +
 src/llama-vocab.cpp            | 15 +++++++++++++++
 6 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 96a2b692a86c1..4b2efd24fb0f4 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", },
     {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
+    {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", },
 ]

 # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 7c2877f56c644..8a08ff7f77a3a 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -32,6 +32,7 @@ class TensorNameMap:
             "model.word_embeddings",                # bailingmoe
             "language_model.model.embed_tokens",    # llama4
             "encoder",                              # neobert
+            "model.text_model.embed_tokens.weight", # smoldocling
         ),

         # Token type embeddings
@@ -63,7 +64,7 @@ class TensorNameMap:
         MODEL_TENSOR.OUTPUT: (
             "embed_out",                # gptneox
             "lm_head",                  # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe
-            "output",                   # llama-pth bloom internlm2
+            "output",                   # llama-pth bloom internlm2 smoldocling
             "word_embeddings_for_head", # persimmon
             "lm_head.linear",           # phi2
             "output_layer",             # chatglm
@@ -93,6 +94,7 @@ class TensorNameMap:
             "model.ln_out",              # rwkv7
             "backbone.final_layer_norm", # wavtokenizer
             "model.norm",                # llama4
+            "output_norm",               # smoldocling
         ),

         # Rope frequencies
@@ -136,6 +138,7 @@ class TensorNameMap:
             "model.layers.{bid}.ln1",                   # rwkv7
             "model.layers.{bid}.input_layernorm",       # llama4
             "transformer_encoder.{bid}.attention_norm", # neobert
+            "blk.{bid}.attn_norm",                      # smoldocling
         ),

         # Attention norm 2
@@ -179,6 +182,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok
             "transformer.h.{bid}.attn.attention.q_proj",                  # exaone
             "model.layers.{bid}.self_attn.q_proj",                        # llama4
+            "blk.{bid}.attn_q",                                           # smoldocling
         ),

         # Attention key
@@ -195,6 +199,7 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok
             "transformer.h.{bid}.attn.attention.k_proj",                # exaone
             "model.layers.{bid}.self_attn.k_proj",                      # llama4
+            "blk.{bid}.attn_k",                                         # smoldocling
         ),

         # Attention value
@@ -210,6 +215,8 @@ class TensorNameMap:
             "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok
             "transformer.h.{bid}.attn.attention.v_proj",                  # exaone
             "model.layers.{bid}.self_attn.v_proj",                        # llama4
+            "blk.{bid}.attn_v",                                           # smoldocling
+
         ),

         # Attention output
@@ -240,6 +247,7 @@ class TensorNameMap:
             "transformer.h.{bid}.attn.attention.out_proj", # exaone
             "model.layers.{bid}.self_attn.o_proj",         # llama4
             "transformer_encoder.{bid}.wo",                # neobert
+            "blk.{bid}.attn_output",                       # smoldocling
         ),

         # Attention output norm
@@ -249,6 +257,7 @@ class TensorNameMap:
             "encoder.layers.{bid}.norm1",                     # nomic-bert
             "transformer.decoder_layer.{bid}.rms_norm_1",     # Grok
             "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
+            "blk.{bid}.attn_norm",                            # smoldocling
         ),

         MODEL_TENSOR.ATTN_POST_NORM: (
@@ -281,6 +290,7 @@ class TensorNameMap:
             "transformer.layers.{bid}.ffn_norm",           # openelm
             "model.layers.{bid}.post_attention_layernorm", # llama4
             "transformer_encoder.{bid}.ffn_norm",          # neobert
+            "blk.{bid}.ffn_norm",                          # smoldocling
         ),

         # Post feed-forward norm
@@ -347,6 +357,7 @@ class TensorNameMap:
             "transformer.h.{bid}.mlp.c_fc_1",          # exaone
             "model.layers.{bid}.feed_forward.up_proj", # llama4
             "transformer_encoder.{bid}.ffn.w12",       # neobert
+            "blk.{bid}.ffn_up",                        # smoldocling
         ),

         MODEL_TENSOR.FFN_UP_EXP: (
@@ -385,6 +396,8 @@ class TensorNameMap:
             "model.layers.{bid}.residual_mlp.w1",        # arctic
             "transformer.h.{bid}.mlp.c_fc_0",            # exaone
             "model.layers.{bid}.feed_forward.gate_proj", # llama4
+            "blk.{bid}.ffn_gate",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_GATE_EXP: (
@@ -432,6 +445,8 @@ class TensorNameMap:
             "model.layers.h.{bid}.mlp.c_proj",           # exaone
             "model.layers.{bid}.feed_forward.down_proj", # llama4
             "transformer_encoder.{bid}.ffn.w3",          # neobert
+            "blk.{bid}.ffn_down",                        # smoldocling
+
         ),

         MODEL_TENSOR.FFN_DOWN_EXP: (
diff --git a/include/llama.h b/include/llama.h
index dc86aea41dcbd..045453e0e4107 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -118,6 +118,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL        = 34,
         LLAMA_VOCAB_PRE_TYPE_SEED_CODER     = 35,
         LLAMA_VOCAB_PRE_TYPE_HUNYUAN        = 36,
+        LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING    = 37,
     };

     enum llama_rope_type {
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index fc4e9a5af004d..f93efe38131c3 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_190M:          return "190M";
         case LLM_TYPE_220M:          return "220M";
         case LLM_TYPE_250M:          return "250M";
+        case LLM_TYPE_256M:          return "256M";
         case LLM_TYPE_270M:          return "270M";
         case LLM_TYPE_335M:          return "335M";
         case LLM_TYPE_410M:          return "410M";
@@ -581,6 +582,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     case 22: type = LLM_TYPE_1B; break;
                     case 26: type = LLM_TYPE_3B; break;
                     case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B
+                    case 30: type = LLM_TYPE_256M; break; // smoldocling 256M
                     // granite uses a vocab with len 49152
                     case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? LLM_TYPE_7B : LLM_TYPE_8B); break;
                     case 36: type = LLM_TYPE_8B; break; // granite
diff --git a/src/llama-model.h b/src/llama-model.h
index 70a6dc89e1b06..541997a121516 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -32,6 +32,7 @@ enum llm_type {
     LLM_TYPE_190M,
     LLM_TYPE_220M,
     LLM_TYPE_250M,
+    LLM_TYPE_256M, // smoldocling
    LLM_TYPE_270M,
    LLM_TYPE_335M,
    LLM_TYPE_410M,
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 551bba171c0e0..69c26f5f9fecd 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -425,6 +425,14 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING:
+                // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of
+                // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json
+                regex_exprs = {
+                    "[0-9]",
+                    "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1658,8 +1665,13 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
                 clean_spaces = false;
             } else if (
+<<<<<<< HEAD
                 tokenizer_pre == "hunyuan") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
+=======
+                tokenizer_pre == "smoldocling") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;
+>>>>>>> 61cfad02 (support for smoldocling)
                 clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
@@ -1844,6 +1856,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                         || t.first == "<EOT>"
                         || t.first == "_<EOT>"
                         || t.first == "<|end▁of▁sentence|>" // DeepSeek
+                        || t.first == "<end_of_utterance>"  // smoldocling
                     ) {
                     special_eot_id = t.second;
                     if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {
@@ -2003,6 +2016,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     || t.first == "<EOT>"
                     || t.first == "_<EOT>"
                     || t.first == "<|end_of_text|>"
+                    || t.first == "<end_of_utterance>" // smoldocling
                     ) {
                 special_eog_ids.insert(t.second);
                 if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) {

From 5050f278f527d2b19a3e772d5b4cd171de0e042c Mon Sep 17 00:00:00 2001
From: ryan-mangeno
Date: Wed, 9 Jul 2025 14:53:04 -0400
Subject: [PATCH 06/32] pre tokenizer merge fix

---
 src/llama-vocab.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 4f16fc6809016..aec702382b966 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1667,14 +1667,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
             clean_spaces = false;
         } else if (
-<<<<<<< HEAD
             tokenizer_pre == "hunyuan") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
-=======
-            tokenizer_pre == "smoldocling") {
-            pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;
->>>>>>> 61cfad02 (support for smoldocling)
-
             clean_spaces = false;
         } else if (
             tokenizer_pre == "smoldocling") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING;

From fbfcdaecfc458b5a7aba6c0c594f70a731e4d55d Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 9 Jul 2025 08:26:13 +0200
Subject: [PATCH 07/32] convert : fix smollm3 jinja template (#14586)

Signed-off-by: ryan-mangeno
---
 convert_hf_to_gguf.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 3f3dfb416c1fc..5d12d4799fafc 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6692,6 +6692,16 @@ def prepare_tensors(self):
 class SmolLM3Model(LlamaModel):
     model_arch = gguf.MODEL_ARCH.SMOLLM3

+    def set_vocab(self):
+        super().set_vocab()
+        # remove unsupported array slicing in chat template
+        # ref: https://huggingface.co/ggml-org/SmolLM3-3B-GGUF/discussions/1
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
+        if tokenizer.chat_template is not None:
+            chat_template = tokenizer.chat_template.replace("[:]", "")
+            self.gguf_writer.add_chat_template(chat_template)
+

 ###### CONVERSION LOGIC ######
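The SmolLM3 fix above works because chat templates are stored as plain strings in the GGUF metadata, so an unsupported Jinja construct can be patched textually before writing. A hypothetical fragment of the kind of template the commit targets (the real SmolLM3 template is much longer):

    # "[:]" copies a list in full, so dropping it leaves behavior unchanged
    template = "{% for m in messages[:] %}{{ m['content'] }}{% endfor %}"
    fixed = template.replace("[:]", "")
    # -> "{% for m in messages %}{{ m['content'] }}{% endfor %}"

Since full-list slicing is a no-op, the rewritten template renders identically while avoiding the slicing syntax that llama.cpp's template engine rejects.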
From 43942e626edbbd1d30f1c17e8dfdb8060e42f1be Mon Sep 17 00:00:00 2001
From: ibrahim khadraoui <132432132+ibrahimkhadraoui@users.noreply.github.com>
Date: Wed, 9 Jul 2025 12:03:49 +0400
Subject: [PATCH 08/32] model : add support for Falcon-H1 family (#14534)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* v1
* push more fixes
* another fix
* fix
* more fixes
* minor fix
* more cleaning on python code
* python fixes
* changed precision for multipliers float 32->64
* fixes
* another fix
* fix
* pre-norm -> norm
* fix
* Revert "fix"

This reverts commit 243e4d1a50bd73467d99f6b289b9a1826f83b94b.

* fix
* small fix ffn_norm
* try
* mix instead of max
* fix vocab size
* conflict solve
* fixed multipliers
* falcon-h1 specefic vocab resolved
* read arch from gguf.MODEL_ARCH
* mamba_d_ssm added to d_inner find_hparam
* remove unused functions from gguf_writer.py
* override modify_tensors instead of get_tensors
* fix conversion and d_inner
* added some cb functions for debugging puposes
* inp_out_ids moved outside of layers loop
* mup_vec create as float64
* fix rope_theta
* injected mup
* clean ups
* rm extra space
* rm unused MAMBA_CHUNK_SIZE
* rm unused key
* add bos False
* changed ROPE_TYPE
* cleaning debugging stuff
* cleaning debug quant
* fix comment
* some cleanups
* some cleanups
* Update src/llama-model-loader.cpp
* more cleanups
* moe cleanuips
* d_ssm -> d_inner;
* cleaning unused hparams
* cleanup
* more cleanups
* more cleanups on python conversion;
* minor cleanups
* Apply suggestions from code review

Co-authored-by: Georgi Gerganov

* remove todo
* added falcon-h1
* tensor not required
* clean
* remove unneeded attributes
* more cleanups and fixed conversion
* remove final_norm
* flake8 fixes
* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret

* flake8 fixes
* Update src/llama-hparams.cpp

Co-authored-by: Sigbjørn Skjæret

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret

* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret

* Update src/llama-arch.cpp

Co-authored-by: Sigbjørn Skjæret

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret

* added hashes
* Update src/llama-arch.cpp

Co-authored-by: Georgi Gerganov

* Update src/llama-vocab.cpp

Co-authored-by: Georgi Gerganov

* update the update file
* Revert "update the update file"

This reverts commit 082ab4ad2a3927384d878666a5f8cae4eb15f577.

* fix: address suggestions
* fix: update convert_hf_to_gguf.py
* Update gguf-py/gguf/constants.py

Co-authored-by: Sigbjørn Skjæret

* Update src/llama-model-loader.cpp

Co-authored-by: Sigbjørn Skjæret

* d_inner fixed
* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret

* reshaping ssm_norm for 34B
* removing generate_mup
* remove duplicates metadata keys
* rm comment
* final comment
* fix unused args
* fix constants
* fix bad merge
* Update src/llama-model.cpp

Co-authored-by: compilade

* falcon-h1: remove unused ssm_in_b and bad merge
* Update src/llama-model.cpp

Co-authored-by: Sigbjørn Skjæret

* falcon-h1: fix last comment
* Update convert_hf_to_gguf.py

Co-authored-by: compilade

* falcon-h1: revert add_add_bos(False)
* falcon-h1: fix tied weights
* falcon-h1: remove whitespace
* falcon-h1: fix wrong size param
* falcon-h1: fix whitespace issues

---------

Co-authored-by: younesbelkada
Co-authored-by: Younes B <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Georgi Gerganov
Co-authored-by: Sigbjørn Skjæret
Co-authored-by: compilade
Signed-off-by: ryan-mangeno
---
 convert_hf_to_gguf.py          | 131 +++++++++-
 convert_hf_to_gguf_update.py   |   5 +
 gguf-py/gguf/constants.py      |  36 ++++
 gguf-py/gguf/tensor_mapping.py |  10 +
 src/llama-arch.cpp             |  30 ++-
 src/llama-arch.h               |   1 +
 src/llama-model.cpp            | 380 ++++++++++++++++++++++++++++++++-
 src/llama-vocab.cpp            |   1 +
 8 files changed, 585 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5d12d4799fafc..4dedc020b61e1 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -818,6 +818,18 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664":
             # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct
             res = "hunyuan"
+        if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base
+            res = "falcon-h1"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-1B-Base
+            res = "falcon-h1"
+        if chkhsh == "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-7B-Base
+            res = "falcon-h1"
+        if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
+            # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
+            res = "falcon-h1"

         if res is None:
             logger.warning("\n")
@@ -4899,17 +4911,19 @@ def set_vocab(self):
     def set_gguf_parameters(self):
         d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
         d_conv  = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4
-        d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+        d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
         d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 128
-        head_dim = self.find_hparam(["head_dim"], optional=True) or 64
+        head_dim = self.find_hparam(["mamba_d_head", "head_dim"], optional=True) or 64
         n_group = self.find_hparam(["n_groups"], optional=True) or 1

         rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5

         # Fail early for models which don't have a block expansion factor of 2
         # TODO: does this really matter?
-        assert d_inner == 2 * d_model
-        assert d_inner % head_dim == 0
+        # skip the assertion for FalconH1 Model
+        if self.model_arch != gguf.MODEL_ARCH.FALCON_H1:
+            assert d_inner == 2 * d_model
+            assert d_inner % head_dim == 0

         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
@@ -4946,7 +4960,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             data_torch = data_torch.reshape((*data_torch.shape, 1))
         elif self.match_model_tensor_name(new_name, gguf.MODEL_TENSOR.SSM_NORM, bid):
             d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
-            d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
+            d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * d_model
             n_group = self.hparams.get("n_groups", 1)
             data_torch = data_torch.reshape((n_group, d_inner // n_group))

@@ -6539,6 +6553,113 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])


+@ModelBase.register("FalconH1ForCausalLM")
+class FalconH1Model(Mamba2Model):
+    model_arch = gguf.MODEL_ARCH.FALCON_H1
+
+    def __init__(self, *args, **kwargs):
+        # Set the hparam prefixes for Falcon Mamba2
+        self.hparam_prefixes = ["mamba"]
+
+        # Initialize the base Mamba2Model
+        super().__init__(*args, **kwargs)
+
+        # Use Llama conversion for attention
+        self._transformer_model_class = LlamaModel
+
+        # n_group and d_inner are used during reshape_tensors for mamaba2
+        self.n_group = self.find_hparam(["n_groups"])
+        self.d_inner = self.find_hparam(["mamba_d_ssm"])
+        self.d_head = self.find_hparam(["d_head"])
+
+        # Initialize any Falcon Mamba2 specific attributes
+        self.has_attention = True  # Falcon Mamba2 has attention components
+
+        # Load Falcon-H1 multipliers from hyperparameters
+        self.attention_in_multiplier = self.find_hparam(["attention_in_multiplier"], optional=True)
+        self.attention_out_multiplier = self.find_hparam(["attention_out_multiplier"], optional=True)
+        self.ssm_in_multiplier = self.find_hparam(["ssm_in_multiplier"], optional=True)
+        self.ssm_out_multiplier = self.find_hparam(["ssm_out_multiplier"], optional=True)
+        self.mlp_multipliers = self.find_hparam(["mlp_multipliers"], optional=True)
+        self.ssm_multipliers = self.find_hparam(["ssm_multipliers"], optional=True)
+        self.intermediate_size = self.find_hparam(["intermediate_size"])
+        self.key_multiplier = self.find_hparam(["key_multiplier"], optional=True)
+
+    def find_hparam(self, keys: Iterable[str], *args, **kwargs) -> Any:
+        prefixed = []
+        for pfx in self.hparam_prefixes:
+            prefixed.extend(
+                "_".join([pfx, k])
+                for k in keys
+            )
+        keys = list(keys) + prefixed
+        return super().find_hparam(keys, *args, **kwargs)
+
+    def set_vocab(self):
+        self._set_vocab_gpt2()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        tensors = list(super().modify_tensors(data_torch, name, bid))
+        tensor = tensors[0][1]
+
+        if "down_proj" in name:
+            tensor = tensor * self.mlp_multipliers[1]
+        elif "gate_proj" in name:
+            tensor = tensor * self.mlp_multipliers[0]
+        elif "k_proj" in name:
+            tensor = tensor * self.key_multiplier * self.attention_in_multiplier
+        elif "q_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "v_proj" in name:
+            tensor = tensor * self.attention_in_multiplier
+        elif "o_proj" in name:
+            tensor = tensor * self.attention_out_multiplier
+        elif "out_proj" in name:
+            tensor = tensor * self.ssm_out_multiplier
+        elif "in_proj" in name:
+            tensor = tensor * self.ssm_in_multiplier
+            zxbcdt_multipliers = self.hparams["ssm_multipliers"]
+            intermediate_size = self.hparams["mamba_d_ssm"]
+            groups_time_state_size = self.hparams["mamba_n_groups"] * self.hparams["mamba_d_state"]
+            tensor[:intermediate_size, :] *= zxbcdt_multipliers[0]
+            tensor[intermediate_size:2 * intermediate_size, :] *= zxbcdt_multipliers[1]
+            tensor[2 * intermediate_size:2 * intermediate_size + groups_time_state_size, :] *= zxbcdt_multipliers[2]
+            tensor[2 * intermediate_size + groups_time_state_size:2 * intermediate_size + 2 * groups_time_state_size, :] *= zxbcdt_multipliers[3]
+            tensor[2 * intermediate_size + 2 * groups_time_state_size:, :] *= zxbcdt_multipliers[4]
+        elif "lm_head" in name:
+            tensor = tensor * self.hparams["lm_head_multiplier"]
+        elif "embed_tokens" in name:
+            tensor = tensor * self.hparams["embedding_multiplier"]
+        elif "mamba.norm" in name:
+            tensor = tensor.reshape(self.n_group, self.d_inner // self.n_group)
+
+        tensors = [(tensors[0][0], tensor)]
+        return tensors
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        ## General Params ##
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Override some Mamba2 defaults
+        self.gguf_writer.add_block_count(self.block_count)
+        self.gguf_writer.add_context_length(self.hparams.get("max_position_embeddings", 0))
+        self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
+
+        ## Attention params ##
+        self.gguf_writer.add_head_count(self.hparams["num_attention_heads"])  # Override value 0 from Mamba2
+        self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"])
+        self.gguf_writer.add_key_length(self.hparams["head_dim"])
+        self.gguf_writer.add_value_length(self.hparams["head_dim"])
+
+        ## Validation ##
+        assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
+        assert self.d_inner % self.d_head == 0, f"SSM inner size {self.d_inner} not a multiple of head dim {self.d_head}"
+
+        # Add any other Falcon Mamba2 specific configuration
+        self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
+
+
 @ModelBase.register("HunYuanMoEV1ForCausalLM")
 class HunYuanMoEModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_MOE
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 96a2b692a86c1..15a326e695dd5 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -138,6 +138,11 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
     {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
     {"name": "hunyuan", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tencent/Hunyuan-A13B-Instruct", "chkhsh": "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664"},
+    # falcon-h1 series uses 4 different tokenizers across model sizes (0.5b - 34b), hence we need to define 4 different hashes
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base", "chkhsh": "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-1B-Base", "chkhsh": "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-7B-Base", "chkhsh": "3eda48b4c4dc7de733d1a8b3e3b4a85243dbbf704da2ee9d42c6beced8897896"},
+    {"name": "falcon-h1", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon-H1-34B-Base", "chkhsh": "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b"},
 ]
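For reference, the chkhsh values registered here and in convert_hf_to_gguf.py fingerprint each tokenizer's behavior rather than its files, which is why the four Falcon-H1 sizes each need their own entry. Roughly (a sketch of the hashing scheme as I understand it; the actual probe text is a long fixed string shared by the converter and this update script):

    from hashlib import sha256
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("tiiuae/Falcon-H1-0.5B-Base")
    chktxt = "..."  # placeholder for the fixed probe string
    chkhsh = sha256(str(tokenizer.encode(chktxt)).encode()).hexdigest()
    # with the real probe string this should yield the 0.5B hash listed above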
LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_XVERSE, { @@ -1967,9 +1992,10 @@ bool llm_arch_is_recurrent(const llm_arch & arch) { } bool llm_arch_is_hybrid(const llm_arch & arch) { - // TODO: There are currently no hybrid models! Once there are, this will be - // the place to identify them + // List all mamba-attention hybrid models here switch (arch) { + case LLM_ARCH_FALCON_H1: + return true; default: return false; } diff --git a/src/llama-arch.h b/src/llama-arch.h index ba5d03fa24ebe..deb3bcd5bc0e3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -50,6 +50,7 @@ enum llm_arch { LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, LLM_ARCH_MAMBA2, + LLM_ARCH_FALCON_H1, LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, LLM_ARCH_COHERE2, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fc4e9a5af004d..e424350bdd783 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1550,6 +1550,37 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_FALCON_H1: + { + // Common parameters + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + // SSM parameters + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_GROUP_COUNT, hparams.ssm_n_group); + + std::fill(hparams.recurrent_layer_arr.begin(), hparams.recurrent_layer_arr.end(), true); + + switch (hparams.n_layer) { + case 36: + type = LLM_TYPE_0_5B; break; + case 24: + type = LLM_TYPE_1_5B; break; + case 66: + type = LLM_TYPE_1B; break; + case 32: + type = LLM_TYPE_3B; break; + case 44: + type = LLM_TYPE_7B; break; + case 72: + type = LLM_TYPE_34B; break; + default: + type = LLM_TYPE_UNKNOWN; + } + } break; case LLM_ARCH_HUNYUAN_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4497,6 +4528,83 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_FALCON_H1: + { + // Common + const int64_t hidden_size = hparams.n_embd; // hidden_size + + // mamba2 Mixer SSM params + const int64_t ssm_conv_kernel_size = hparams.ssm_d_conv; // ssm_conv_kernel_size + const int64_t ssm_n_groups = hparams.ssm_n_group; // ssm_n_groups + const int64_t ssm_state_size = hparams.ssm_d_state; // ssm_state_size + const int64_t ssm_intermediate_size = hparams.ssm_d_inner; // TODO expand + const int64_t ssm_num_heads = hparams.ssm_dt_rank; // ssm_num_heads + const int64_t ssm_conv_dim = ssm_intermediate_size + 2 * ssm_n_groups * ssm_state_size; + const int64_t ssm_projection_size = ssm_intermediate_size + ssm_conv_dim + ssm_num_heads; + + // attn params + const int64_t attn_num_attention_head = hparams.n_head(0); // rename to: attn_num_attention_head + const int64_t attn_num_key_value_head = hparams.n_head_kv(0); + + // ffn params + const int64_t ffn_intermediate_size = hparams.n_ff(0); + + // embeddings + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, 0); + + // output + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, 
TENSOR_NOT_REQUIRED); + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + /*SSM LAYERS*/ + // ssm in + layer.ssm_in = create_tensor(tn(LLM_TENSOR_SSM_IN, "weight", i), {hidden_size, ssm_projection_size}, 0); + // ssm 1d conv + layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {ssm_conv_kernel_size, ssm_conv_dim}, 0); + layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {ssm_conv_dim}, TENSOR_NOT_REQUIRED); + // ssm_dt + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {ssm_num_heads}, 0); + // no "weight" suffix for these + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, ssm_num_heads}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, ssm_num_heads}, 0); + // ssm_norm + layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {ssm_intermediate_size / ssm_n_groups, ssm_n_groups}, TENSOR_NOT_REQUIRED); + // out_proj + layer.ssm_out = create_tensor(tn(LLM_TENSOR_SSM_OUT, "weight", i), {ssm_intermediate_size, hidden_size}, 0); + + /*ATTENTION LAYERS*/ + // attention layers (with optional bias) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {hidden_size, n_embd_head_k * attn_num_attention_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {hidden_size, attn_num_key_value_head * n_embd_head_v}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * attn_num_attention_head, hidden_size}, 0); + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {attn_num_key_value_head * n_embd_head_k}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {attn_num_key_value_head * n_embd_head_v}, TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {hidden_size}, 0); + + + // feed forward (w/ optional biases) + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, i), {hidden_size}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { ffn_intermediate_size, hidden_size}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {hidden_size, ffn_intermediate_size}, 0); + + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {hidden_size}, TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {ffn_intermediate_size}, TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_HUNYUAN_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -10147,7 +10255,7 @@ struct llm_build_mamba : public llm_graph_context { // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); - // cb(cur, "mamba_out", il); + cb(cur, "mamba_out", il); return cur; } @@ -14598,6 +14706,267 @@ struct llm_build_ernie4_5 : public llm_graph_context { } }; +struct llm_build_falcon_h1 : public llm_graph_context { + const llama_model & model; + + llm_build_falcon_h1(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + // Build the inputs in the recurrent & kv cache + auto * inp = build_inp_mem_hybrid(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, hparams.rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur-post-rope", il); + cb(Kcur, "Kcur-post-rope", il); + cb(Vcur, "Vcur-post-rope", il); + + ggml_tensor * attn_out = build_attn(inp, gf, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(attn_out, "attn_out", il); + + cur = build_norm(inpL, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + // Mamba2 layer + cb(cur, "ssm_in", il); + + ggml_tensor * ssm_out = build_mamba2_layer(inp, gf, cur, ubatch, il); + cb(ssm_out, "ssm_out", il); + + // // Aggregation + cur = ggml_add(ctx0, attn_out, ssm_out); + inpSA = ggml_add(ctx0, cur, inpSA); + cb(cur, "layer_out", il); + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + ggml_tensor * ffn_inp = inpSA; + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, inpSA); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor * build_mamba2_layer( + llm_graph_input_mem_hybrid * inp, + ggml_cgraph * gf, + ggml_tensor * cur, + const llama_ubatch & ubatch, + int il) const { + const auto * kv_state = static_cast(mctx)->get_recr(); + + const auto kv_head = kv_state->get_head(); + + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t n_head = hparams.ssm_dt_rank; + const int64_t head_dim = d_inner / n_head; + const int64_t n_group = hparams.ssm_n_group; + const int64_t n_seqs = ubatch.n_seqs; + + const int64_t n_seq_tokens = ubatch.n_seq_tokens; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs); 
+ GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); + + ggml_tensor * conv_states_all = kv_state->get_r_l(il); + ggml_tensor * ssm_states_all = kv_state->get_s_l(il); + + ggml_tensor * conv = build_rs(inp, gf, conv_states_all, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs); + + // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs} + cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs); + + // d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads + + // {n_embd, d_in_proj} @ {n_embd, n_seq_tokens, n_seqs} => {d_in_proj, n_seq_tokens, n_seqs} + ggml_tensor * zxBCdt = build_lora_mm(model.layers[il].ssm_in, cur); + cb(zxBCdt, "zxBCdt", il); + + // split the above in three + ggml_tensor * z = ggml_view_4d(ctx0, zxBCdt, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*zxBCdt->nb[0], zxBCdt->nb[1], zxBCdt->nb[2], 0); + ggml_tensor * xBC = ggml_view_3d(ctx0, zxBCdt, d_inner + 2*n_group*d_state, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], d_inner*ggml_element_size(zxBCdt)); + ggml_tensor * dt = ggml_view_3d(ctx0, zxBCdt, n_head, n_seq_tokens, n_seqs, zxBCdt->nb[1], zxBCdt->nb[2], (2*d_inner + 2*n_group*d_state)*ggml_element_size(zxBCdt)); + + // conv + { + // => {d_conv - 1 + n_seq_tokens, d_inner + 2*n_group*d_state, n_seqs} + ggml_tensor * conv_x = ggml_concat(ctx0, conv, ggml_transpose(ctx0, xBC), 0); + + // copy last (d_conv - 1) columns back into the state cache + ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner + 2*n_group*d_state, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); + + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, last_conv, + ggml_view_1d(ctx0, conv_states_all, + (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs), + kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); + + // 1D convolution + // The equivalent is to make a self-overlapping view of conv_x + // over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weight, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // For simultaneous sequences, all sequences need to have the same length. 
+ xBC = ggml_ssm_conv(ctx0, conv_x, model.layers[il].ssm_conv1d); + + // bias + xBC = ggml_add(ctx0, xBC, model.layers[il].ssm_conv1d_b); + + xBC = ggml_silu(ctx0, xBC); + } + + // ssm + { + // These correspond to V K Q in SSM/attention duality + ggml_tensor * x = ggml_view_4d(ctx0, xBC, head_dim, n_head, n_seq_tokens, n_seqs, head_dim*xBC->nb[0], xBC->nb[1], xBC->nb[2], 0); + + ggml_tensor * B = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], d_inner*ggml_element_size(xBC)); + + ggml_tensor * C = ggml_view_4d(ctx0, xBC, d_state, n_group, n_seq_tokens, n_seqs, d_state*xBC->nb[0], xBC->nb[1], xBC->nb[2], (d_inner + n_group*d_state)*ggml_element_size(xBC)); + + // {n_head, n_seq_tokens, n_seqs} + dt = ggml_add(ctx0, ggml_cont(ctx0, dt), model.layers[il].ssm_dt_b); + + ggml_tensor * A = model.layers[il].ssm_a; + + // use the states and the indices provided by build_rs + // (this is necessary in order to properly use the states before they are overwritten, + // while avoiding to make unnecessary copies of the states) + auto get_ssm_rows = [&](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) { + ggml_tensor * ssm = ggml_reshape_4d(ctx, states, d_state, head_dim, n_head, kv_state->get_size()); + + // TODO: use semistructured matrices to implement state-space duality + // => {d_inner, n_seq_tokens, n_seqs} and {d_state, d_inner, n_seqs} + return ggml_ssm_scan(ctx, ssm, x, dt, A, B, C, ids); + }; + + ggml_tensor * y_ssm = build_rs(inp, gf, ssm_states_all, hparams.n_embd_s(), ubatch.n_seqs, get_ssm_rows); + + // store last states + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, ggml_nelements(x)*x->nb[0]), + ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); + + ggml_tensor * y = ggml_view_4d(ctx0, y_ssm, head_dim, n_head, n_seq_tokens, n_seqs, x->nb[1], n_head*x->nb[1], n_seq_tokens*n_head*x->nb[1], 0); + + // TODO: skip computing output earlier for unused tokens + + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); + y = ggml_swiglu_split(ctx0, ggml_cont(ctx0, z), y); + + // grouped RMS norm + if (model.layers[il].ssm_norm) { + y = ggml_reshape_4d(ctx0, y, d_inner / n_group, n_group, n_seq_tokens, n_seqs); + y = build_norm(y, model.layers[il].ssm_norm, NULL, LLM_NORM_RMS, il); + } + + y = ggml_reshape_3d(ctx0, y, d_inner, n_seq_tokens, n_seqs); + + // {d_inner, n_embd} @ {d_inner, n_seq_tokens, n_seqs} => {n_embd, n_seq_tokens, n_seqs} + cur = build_lora_mm(model.layers[il].ssm_out, y); + } + + // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens} + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], n_seq_tokens * n_seqs); + cb(cur, "mamba_out", il); + return cur; + } +}; + struct llm_build_arcee : public llm_graph_context { llm_build_arcee(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -15077,7 +15446,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, /* recurrent_type_v */ GGML_TYPE_F32, /* recurrent_kv_size */ std::max((uint32_t) 1, cparams.n_seq_max), /* n_seq_max */ cparams.n_seq_max, - /* offload */ cparams.offload_kqv); + /* offload */ cparams.offload_kqv, + /* filter_attn */ (arch == LLM_ARCH_FALCON_H1) ? [&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr, + /* filter_recr */ (arch == LLM_ARCH_FALCON_H1) ? 
[&](int32_t) { return true; } : (llama_memory_hybrid::layer_filter_cb)nullptr); } else { const auto padding = llama_kv_cache_unified::get_padding(cparams); @@ -15419,6 +15790,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_FALCON_H1: + { + llm = std::make_unique<llm_build_falcon_h1>(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -15577,6 +15952,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: + case LLM_ARCH_FALCON_H1: case LLM_ARCH_GROK: case LLM_ARCH_DBRX: case LLM_ARCH_BERT: diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 551bba171c0e0..b7f14dc07b609 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1523,6 +1523,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || + tokenizer_pre == "falcon-h1" || tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; From 00d2fc2141cc7095e8b0ecb45b0e5be20cd9d325 Mon Sep 17 00:00:00 2001 From: Sigbjørn Skjæret Date: Wed, 9 Jul 2025 10:19:50 +0200 Subject: [PATCH 09/32] llama : remove unintended whitespace (#14592) Signed-off-by: ryan-mangeno --- src/llama-model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e424350bdd783..4468c837f1c28 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4555,7 +4555,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // output output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hidden_size, n_vocab}, TENSOR_NOT_REQUIRED); output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {hidden_size}, 0); - + // if output is NULL, init from the input tok embed if (output == NULL) { output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hidden_size, n_vocab}, TENSOR_DUPLICATED); From e1b0b9ed072d531ad2a9846341c1b9553794c303 Mon Sep 17 00:00:00 2001 From: Dowon Date: Wed, 9 Jul 2025 17:22:31 +0900 Subject: [PATCH 10/32] model : add skt/A.X-4.0 model vocabulary (#14589) Signed-off-by: ryan-mangeno --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 1 + src/llama-vocab.cpp | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 4dedc020b61e1..702827f4d5d2e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -818,6 +818,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "7e57df22b1fe23a7b1e1c7f3dc4e3f96d43a4eb0836d0c6bdc3436d7b2f1c664": # ref: https://huggingface.co/tencent/Hunyuan-A13B-Instruct res = "hunyuan" + if chkhsh == "b0a6b1c0bd5998ebd9df08611efde34a4ff03faed45ae09c43e6b31ebd4b94cf": + # ref: https://huggingface.co/skt/A.X-4.0 + res = "a.x-4.0" if chkhsh == "a6b57017d60e6edb4d88ecc2845188e0eb333a70357e45dcc9b53964a73bbae6": # ref: https://huggingface.co/tiiuae/Falcon-H1-0.5B-Base res = "falcon-h1" diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 15a326e695dd5..b8cb6027d6de5 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo":
"https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, + {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index b7f14dc07b609..6aa1d901c5e36 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1556,7 +1556,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "jina-de" || tokenizer_pre == "gigachat" || tokenizer_pre == "jina-v2-es" || - tokenizer_pre == "jina-v2-de") { + tokenizer_pre == "jina-v2-de" || + tokenizer_pre == "a.x-4.0") { pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "jina-v1-en" || From ea265fe47027807dec637fb2d753c315b330dc89 Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Tue, 8 Jul 2025 16:35:49 -0400 Subject: [PATCH 11/32] support for smoldocling Signed-off-by: ryan-mangeno --- convert_hf_to_gguf_update.py | 1 + gguf-py/gguf/tensor_mapping.py | 17 ++++++++++++++++- include/llama.h | 1 + src/llama-model.cpp | 2 ++ src/llama-model.h | 1 + src/llama-vocab.cpp | 13 +++++++++++++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 2f733f0973686..1c3748297c382 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -128,6 +128,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, + {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", }, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b30f77dbe3be7..c769ea664b036 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,6 +32,7 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert + "model.text_model.embed_tokens.weight", # smoldocling ), # Token type embeddings @@ -63,7 +64,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe - "output", # llama-pth bloom internlm2 + "output", # llama-pth bloom internlm2 smoldocling "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 "output_layer", # chatglm @@ -93,6 +94,7 @@ class TensorNameMap: "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 + "output_norm", # smoldocling ), # Rope frequencies @@ -136,6 +138,7 @@ class TensorNameMap: "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert + "blk.{bid}.attn_norm", # smoldocling ), # Attention norm 2 @@ -179,6 +182,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone 
"model.layers.{bid}.self_attn.q_proj", # llama4 + "blk.{bid}.attn_q", # smoldocling ), # Attention key @@ -195,6 +199,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 + "blk.{bid}.attn_k", # smoldocling ), # Attention value @@ -210,6 +215,8 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 + "blk.{bid}.attn_v", # smoldocling + ), # Attention output @@ -240,6 +247,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert + "blk.{bid}.attn_output", # smoldocling ), # Attention output norm @@ -249,6 +257,7 @@ class TensorNameMap: "encoder.layers.{bid}.norm1", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx + "blk.{bid}.attn_norm", # smoldocling ), MODEL_TENSOR.ATTN_POST_NORM: ( @@ -281,6 +290,7 @@ class TensorNameMap: "transformer.layers.{bid}.ffn_norm", # openelm "model.layers.{bid}.post_attention_layernorm", # llama4 "transformer_encoder.{bid}.ffn_norm", # neobert + "blk.{bid}.ffn_norm", # smoldocling ), # Post feed-forward norm @@ -346,6 +356,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 "transformer_encoder.{bid}.ffn.w12", # neobert + "blk.{bid}.ffn_up", # smoldocling ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -383,6 +394,8 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 + "blk.{bid}.ffn_gate", # smoldocling + ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -429,6 +442,8 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 "transformer_encoder.{bid}.ffn.w3", # neobert + "blk.{bid}.ffn_down", # smoldocling + ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/include/llama.h b/include/llama.h index 3eda9bc68608c..d920732622b81 100644 --- a/include/llama.h +++ b/include/llama.h @@ -117,6 +117,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, + LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 36, }; enum llama_rope_type { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index fc39195ed5177..a2aa7856a169e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -40,6 +40,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; case LLM_TYPE_250M: return "250M"; + case LLM_TYPE_256M: return "256M"; case LLM_TYPE_270M: return "270M"; case LLM_TYPE_335M: return "335M"; case LLM_TYPE_410M: return "410M"; @@ -575,6 +576,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { case 22: type = LLM_TYPE_1B; break; case 26: type = LLM_TYPE_3B; break; case 28: type = LLM_TYPE_3B; break; // Llama 3.2 3B + case 30: type = LLM_TYPE_256M; break; // smoldocling 256M // granite uses a vocab with len 49152 case 32: type = n_vocab == 49152 ? LLM_TYPE_3B : (n_vocab < 40000 ? 
LLM_TYPE_7B : LLM_TYPE_8B); break; case 36: type = LLM_TYPE_8B; break; // granite diff --git a/src/llama-model.h b/src/llama-model.h index 40063b790d434..9258e7cf65f2b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -32,6 +32,7 @@ enum llm_type { LLM_TYPE_190M, LLM_TYPE_220M, LLM_TYPE_250M, + LLM_TYPE_256M, // smoldocling LLM_TYPE_270M, LLM_TYPE_335M, LLM_TYPE_410M, diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 5c9eb87566dde..995e87da607ce 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -424,6 +424,13 @@ struct llm_tokenizer_bpe : llm_tokenizer { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; + case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING: + // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of + // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json + regex_exprs = { + "[0-9]", + "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+", + }; default: // default regex for BPE tokenization pre-processing regex_exprs = { @@ -1656,6 +1663,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "seed-coder") { pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; + } else if ( + tokenizer_pre == "smoldocling") { + pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; + clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -1839,6 +1850,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<EOT>" || t.first == "_<EOT>" || t.first == "<|end▁of▁sentence|>" // DeepSeek + || t.first == "<end_of_utterance>" // smoldocling ) { special_eot_id = t.second; if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { @@ -1998,6 +2010,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { || t.first == "<EOT>" || t.first == "_<EOT>" || t.first == "<|end_of_text|>" + || t.first == "<end_of_utterance>" // smoldocling ) { special_eog_ids.insert(t.second); if ((id_to_token[t.second].attr & LLAMA_TOKEN_ATTR_CONTROL) == 0) { From 529552d83ba90c53a3aab85fa9dbd2e00f8c317e Mon Sep 17 00:00:00 2001 From: Miaoqian Lin Date: Wed, 9 Jul 2025 20:33:53 +0800 Subject: [PATCH 12/32] ggml : prevent integer overflow in gguf tensor size calculation (#14595) Signed-off-by: ryan-mangeno --- ggml/src/gguf.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/ggml/src/gguf.cpp b/ggml/src/gguf.cpp index 5ffd12b8b2795..53504399c57f4 100644 --- a/ggml/src/gguf.cpp +++ b/ggml/src/gguf.cpp @@ -631,7 +631,14 @@ struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_par gguf_free(ctx); return nullptr; } - ctx->size += GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment); + size_t padded_size = GGML_PAD(ggml_nbytes(&ti.t), ctx->alignment); + if (SIZE_MAX - ctx->size < padded_size) { + GGML_LOG_ERROR("%s: tensor '%s' size overflow, cannot accumulate size %zu + %zu\n", + __func__, ti.t.name, ctx->size, padded_size); + gguf_free(ctx); + return nullptr; + } + ctx->size += padded_size; } } From 996c578760661129b0c5fb70a7e5cd0c335ce5bc Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Tue, 8 Jul 2025 16:47:52 -0400 Subject: [PATCH 13/32] fixed merge conflicts Signed-off-by: ryan-mangeno --- include/llama.h | 3 ++- src/llama-vocab.cpp | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index d920732622b81..045453e0e4107 100644 ---
a/include/llama.h +++ b/include/llama.h @@ -117,7 +117,8 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, - LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 36, + LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, + LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 37, }; enum llama_rope_type { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 995e87da607ce..dc247383fd87c 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -351,6 +351,7 @@ struct llm_tokenizer_bpe : llm_tokenizer { break; case LLAMA_VOCAB_PRE_TYPE_STABLELM2: case LLAMA_VOCAB_PRE_TYPE_QWEN2: + case LLAMA_VOCAB_PRE_TYPE_HUNYUAN: regex_exprs = { // original regex from tokenizer.json // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" @@ -1663,6 +1664,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "seed-coder") { pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER; clean_spaces = false; + } else if ( + tokenizer_pre == "hunyuan") { + pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; + clean_spaces = false; } else if ( tokenizer_pre == "smoldocling") { pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; From c971ea81f56930d60d216a8c3f04a4507b834e38 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:14:57 -0400 Subject: [PATCH 14/32] Update src/llama-vocab.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-vocab.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 50b30be04201b..aec702382b966 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1673,10 +1673,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "smoldocling") { pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; clean_spaces = false; - } else if ( - tokenizer_pre == "smoldocling") { - pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; - clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From ca3cc83dbf59ce42348308f4a39cc81d77d619c6 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:15:08 -0400 Subject: [PATCH 15/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d90677aee1ce8..995b1b09a88e0 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -216,7 +216,6 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 "blk.{bid}.attn_v", # smoldocling - ), # Attention output From b9b532cc267be43f608f325caa94faa0b732cb53 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:15:15 -0400 Subject: [PATCH 16/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py 
b/gguf-py/gguf/tensor_mapping.py index 995b1b09a88e0..544cc7560f526 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -450,8 +450,7 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba "transformer_encoder.{bid}.ffn.w3", # neobert - "blk.{bid}.ffn_down", # smoldocling - + "blk.{bid}.ffn_down", # smoldocling ), MODEL_TENSOR.FFN_DOWN_EXP: ( From 376ea470b86e2ff8987f7d5aac97a1378087df38 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Wed, 9 Jul 2025 19:15:22 -0400 Subject: [PATCH 17/32] Update src/llama-model.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-model.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.h b/src/llama-model.h index 68584085bd9e7..431efbd516783 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -32,7 +32,7 @@ enum llm_type { LLM_TYPE_190M, LLM_TYPE_220M, LLM_TYPE_250M, - LLM_TYPE_256M, // smoldocling + LLM_TYPE_256M, LLM_TYPE_270M, LLM_TYPE_335M, LLM_TYPE_410M, From 5856f8c024af7b3c2686baf90c1ca17c49ad5bfd Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Wed, 9 Jul 2025 19:42:33 -0400 Subject: [PATCH 18/32] safetensors tensor mapping Signed-off-by: ryan-mangeno --- gguf-py/gguf/tensor_mapping.py | 56 ++++++++++++++++------------------ 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 544cc7560f526..5226b50fe2f53 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,7 +32,7 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert - "model.text_model.embed_tokens.weight", # smoldocling + "model.text_model.embed_tokens", # smoldocling ), # Token type embeddings @@ -64,7 +64,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx jais nemotron exaone olmoe olmo2 phimoe - "output", # llama-pth bloom internlm2 smoldocling + "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 "output_layer", # chatglm @@ -94,7 +94,6 @@ class TensorNameMap: "model.ln_out", # rwkv7 "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 - "output_norm", # smoldocling ), # Rope frequencies @@ -138,7 +137,7 @@ class TensorNameMap: "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert - "blk.{bid}.attn_norm", # smoldocling + "model.text_model.layers.{bid}.input_layernorm", # smoldocling ), # Attention norm 2 @@ -182,7 +181,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone "model.layers.{bid}.self_attn.q_proj", # llama4 - "blk.{bid}.attn_q", # smoldocling + "model.text_model.layers.{bid}.self_attn.q_proj", # smoldocling ), # Attention key @@ -199,7 +198,7 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 - "blk.{bid}.attn_k", # smoldocling + "model.text_model.layers.{bid}.self_attn.k_proj", # smoldocling ), # Attention value @@ -215,7 +214,7 @@ class TensorNameMap: 
"transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 - "blk.{bid}.attn_v", # smoldocling + "model.text_model.layers.{bid}.self_attn.v_proj", # smoldocling ), # Attention output @@ -246,7 +245,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert - "blk.{bid}.attn_output", # smoldocling + "model.text_model.layers.{bid}.self_attn.o_proj", # smoldocling ), # Attention output norm @@ -260,8 +259,9 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge - "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge + "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 + "model.text_model.layers.{bid}.post_attention_layernorm", # smoldocling ), # Rotary embeddings @@ -291,7 +291,6 @@ class TensorNameMap: "model.layers.{bid}.pre_moe_layernorm", # mini-jamba "model.layers.{bid}.post_attention_layernorm", # llama4 "transformer_encoder.{bid}.ffn_norm", # neobert - "blk.{bid}.ffn_norm", # smoldocling ), # Post feed-forward norm @@ -360,8 +359,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba "transformer_encoder.{bid}.ffn.w12", # neobert - "blk.{bid}.ffn_up", # smoldocling - "blk.{bid}.ffn_up", # smoldocling + "model.text_model.layers.{bid}.mlp.up_proj", # smoldocling ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -389,20 +387,20 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "transformer.h.{bid}.mlp.c_fc2", # jais - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) - "transformer.h.{bid}.mlp.linear_1", # refact - "model.layers.{bid}.residual_mlp.w1", # arctic - "transformer.h.{bid}.mlp.c_fc_0", # exaone - "model.layers.{bid}.feed_forward.gate_proj", # llama4 - "blk.{bid}.ffn_gate", # smoldocling - "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.feed_forward.gate_proj", # llama4 + "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba + "model.text_model.layers.{bid}.mlp.gate_proj", # smoldocling ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -450,7 +448,7 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba "transformer_encoder.{bid}.ffn.w3", # neobert - "blk.{bid}.ffn_down", # smoldocling + "model.text_model.layers.{bid}.mlp.down_proj", # 
smoldocling ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -1038,7 +1036,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_MMPROJ_FC: ( - "model.connector.modality_projection.proj", # SmolVLM + "model.connector.modality_projection.proj", # SmolVLM smoldocling ), MODEL_TENSOR.V_MMPROJ_MLP: ( From 5c3d94862860d117e283b8b4fae3f96bc610f0bd Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Thu, 10 Jul 2025 00:31:26 -0400 Subject: [PATCH 19/32] added back accidentally removed clean_spaces for hunyuan --- src/llama-vocab.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index aec702382b966..a4dd2936c919a 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1669,6 +1669,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "hunyuan") { pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; + clean_spaces = false; } else if ( tokenizer_pre == "smoldocling") { pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; From 69fa6722091e60be4b8bedf7cb70130f54853da1 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 09:28:20 -0400 Subject: [PATCH 20/32] Update src/llama-vocab.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-vocab.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index a4dd2936c919a..16982a950b0a9 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -429,9 +429,10 @@ struct llm_tokenizer_bpe : llm_tokenizer { // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json regex_exprs = { - "[0-9]", - "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+", - }; + "[0-9]", + "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+", + }; + break; default: // default regex for BPE tokenization pre-processing regex_exprs = { From f5e4d56022d9c8d77a06352aae53900b06a6c39e Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Thu, 10 Jul 2025 09:50:43 -0400 Subject: [PATCH 21/32] updated hash and reordered model list --- convert_hf_to_gguf.py | 3 +++ convert_hf_to_gguf_update.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2419126ec4ea2..8fc67f99a4ad9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -833,6 +833,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base res = "falcon-h1" + if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": + # ref: https://huggingface.co/ds4sd/SmolDocling-256M-preview + res = "smoldocling" if res is None: logger.warning("\n") diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 98bbe04b0931b..b8cb6027d6de5 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -128,8 +128,8 @@ class TOKENIZER_TYPE(IntEnum): {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, - {"name": "smoldocling", "tokt":
TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", }, {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, + {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", }, ] # some models are known to be broken upstream, so we will skip them as exceptions From 037dbbc33d98beb1c39673c4a5735199e5c0f379 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:27:37 -0400 Subject: [PATCH 22/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5226b50fe2f53..0cd712631bc54 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -398,7 +398,6 @@ class TensorNameMap: "transformer.h.{bid}.mlp.linear_1", # refact "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone - "model.layers.{bid}.feed_forward.gate_proj", # llama4 "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba "model.text_model.layers.{bid}.mlp.gate_proj", # smoldocling ), From 4331939356935fca6f2b68e67f15e7b2a3f7cba1 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:27:50 -0400 Subject: [PATCH 23/32] Update src/llama-vocab.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-vocab.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 16982a950b0a9..01f51c1caa03e 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1671,10 +1671,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "hunyuan") { pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN; clean_spaces = false; - } else if ( - tokenizer_pre == "smoldocling") { - pre_type = LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING; - clean_spaces = false; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } From 661abf595d108bf9df0163586e95d1d5e6670eb9 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:27:57 -0400 Subject: [PATCH 24/32] Update include/llama.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- include/llama.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/llama.h b/include/llama.h index 045453e0e4107..dc86aea41dcbd 100644 --- a/include/llama.h +++ b/include/llama.h @@ -118,7 +118,6 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35, LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36, - LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING = 37, }; enum llama_rope_type { From a3be53cfa438d1d4b71bbd3bdab7491f760ab9bd Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:28:04 -0400 Subject: [PATCH 25/32] Update convert_hf_to_gguf.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/convert_hf_to_gguf.py 
b/convert_hf_to_gguf.py index 8fc67f99a4ad9..2419126ec4ea2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -833,9 +833,6 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base res = "falcon-h1" - if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249": - # ref: https://huggingface.co/ds4sd/SmolDocling-256M-preview - res = "smoldocling" if res is None: logger.warning("\n") From 859005d8d2e045e4d9559fa2d16664b60a5b800e Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:28:11 -0400 Subject: [PATCH 26/32] Update convert_hf_to_gguf_update.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf_update.py | 1 - 1 file changed, 1 deletion(-) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 98bbe04b0931b..b8cb6027d6de5 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -129,7 +129,6 @@ class TOKENIZER_TYPE(IntEnum): {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, - {"name": "smoldocling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ds4sd/SmolDocling-256M-preview", }, ] # some models are known to be broken upstream, so we will skip them as exceptions From db54ea56f4066462c864e491c3df858d4134fb7d Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:28:17 -0400 Subject: [PATCH 27/32] Update src/llama-vocab.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- src/llama-vocab.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 01f51c1caa03e..10823b183da88 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -425,14 +425,6 @@ struct llm_tokenizer_bpe : llm_tokenizer { "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", }; break; - case LLAMA_VOCAB_PRE_TYPE_SMOLDOCLING: - // uses digits and byte level pre tokenizers defined in the pre_tokenizer section of - // https://huggingface.co/ds4sd/SmolDocling-256M-preview/raw/main/tokenizer.json - regex_exprs = { - "[0-9]", - "[a-zA-Z0-9_]+|[^a-zA-Z0-9_\\s]+", - }; - break; default: // default regex for BPE tokenization pre-processing regex_exprs = { From f021d1900c075cac7353404b61975f9e78b8804d Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Thu, 10 Jul 2025 10:30:17 -0400 Subject: [PATCH 28/32] removed old tensor name --- gguf-py/gguf/tensor_mapping.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0cd712631bc54..7fd200a4323be 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -255,7 +255,6 @@ class TensorNameMap: "encoder.layers.{bid}.norm1", # nomic-bert "transformer.decoder_layer.{bid}.rms_norm_1", # Grok "transformer.blocks.{bid}.norm_attn_norm.norm_2", # 
dbrx - "blk.{bid}.attn_norm", # smoldocling ), MODEL_TENSOR.ATTN_POST_NORM: ( From 3f4c53225fd24b92c45135bef05eff0591862825 Mon Sep 17 00:00:00 2001 From: ryan-mangeno Date: Thu, 10 Jul 2025 10:44:13 -0400 Subject: [PATCH 29/32] removed tensor mappings -> handled by smolvlm --- gguf-py/gguf/tensor_mapping.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 7fd200a4323be..3a33d7fd32d58 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -32,7 +32,6 @@ class TensorNameMap: "model.word_embeddings", # bailingmoe "language_model.model.embed_tokens", # llama4 "encoder", # neobert - "model.text_model.embed_tokens", # smoldocling ), # Token type embeddings @@ -137,7 +136,6 @@ class TensorNameMap: "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert - "model.text_model.layers.{bid}.input_layernorm", # smoldocling ), # Attention norm 2 @@ -181,7 +179,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.query",# Grok "transformer.h.{bid}.attn.attention.q_proj", # exaone "model.layers.{bid}.self_attn.q_proj", # llama4 - "model.text_model.layers.{bid}.self_attn.q_proj", # smoldocling ), # Attention key @@ -198,7 +195,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.key",# Grok "transformer.h.{bid}.attn.attention.k_proj", # exaone "model.layers.{bid}.self_attn.k_proj", # llama4 - "model.text_model.layers.{bid}.self_attn.k_proj", # smoldocling ), # Attention value @@ -214,7 +210,6 @@ class TensorNameMap: "transformer.decoder_layer.{bid}.multi_head_attention.value",# Grok "transformer.h.{bid}.attn.attention.v_proj", # exaone "model.layers.{bid}.self_attn.v_proj", # llama4 - "model.text_model.layers.{bid}.self_attn.v_proj", # smoldocling ), # Attention output @@ -245,7 +240,6 @@ class TensorNameMap: "transformer.h.{bid}.attn.attention.out_proj", # exaone "model.layers.{bid}.self_attn.o_proj", # llama4 "transformer_encoder.{bid}.wo", # neobert - "model.text_model.layers.{bid}.self_attn.o_proj", # smoldocling ), # Attention output norm @@ -260,7 +254,6 @@ class TensorNameMap: MODEL_TENSOR.ATTN_POST_NORM: ( "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 - "model.text_model.layers.{bid}.post_attention_layernorm", # smoldocling ), # Rotary embeddings @@ -358,7 +351,6 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc_1", # exaone "model.layers.{bid}.feed_forward.up_proj", # llama4 jamba "transformer_encoder.{bid}.ffn.w12", # neobert - "model.text_model.layers.{bid}.mlp.up_proj", # smoldocling ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -398,7 +390,6 @@ class TensorNameMap: "model.layers.{bid}.residual_mlp.w1", # arctic "transformer.h.{bid}.mlp.c_fc_0", # exaone "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba - "model.text_model.layers.{bid}.mlp.gate_proj", # smoldocling ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -446,7 +437,6 @@ class TensorNameMap: "model.layers.h.{bid}.mlp.c_proj", # exaone "model.layers.{bid}.feed_forward.down_proj", # llama4 jamba "transformer_encoder.{bid}.ffn.w3", # neobert - "model.text_model.layers.{bid}.mlp.down_proj", # smoldocling ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -1034,7 +1024,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_MMPROJ_FC: ( - "model.connector.modality_projection.proj", # SmolVLM smoldocling + 
"model.connector.modality_projection.proj", # SmolVLM ), MODEL_TENSOR.V_MMPROJ_MLP: ( From 310eb46400cf1216abe39289eda427d3db10f64c Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:51:03 -0400 Subject: [PATCH 30/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 3a33d7fd32d58..b49d63733fe10 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -252,8 +252,8 @@ class TensorNameMap: ), MODEL_TENSOR.ATTN_POST_NORM: ( - "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge - "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 + "model.layers.{bid}.post_attention_layernorm", # gemma2 olmo2 # ge + "model.layers.{bid}.post_self_attn_layernorm", # glm-4-0414 ), # Rotary embeddings From bcf4ad35ae1d2832f30596b452deefa9ef99f1c7 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:51:09 -0400 Subject: [PATCH 31/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b49d63733fe10..8994f1d6c1f01 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -378,18 +378,18 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "transformer.h.{bid}.mlp.c_fc2", # jais - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) - "transformer.h.{bid}.mlp.linear_1", # refact - "model.layers.{bid}.residual_mlp.w1", # arctic - "transformer.h.{bid}.mlp.c_fc_0", # exaone - "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba ), MODEL_TENSOR.FFN_GATE_EXP: ( From a4f662ddae41f4aef9063d56c6a55e792dd69aa1 Mon Sep 17 00:00:00 2001 From: Ryan Mangeno <160974989+ryan-mangeno@users.noreply.github.com> Date: Thu, 10 Jul 2025 10:51:15 -0400 Subject: [PATCH 32/32] Update gguf-py/gguf/tensor_mapping.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Sigbjørn Skjæret --- gguf-py/gguf/tensor_mapping.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8994f1d6c1f01..215eb297ebcc1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1024,7 +1024,7 @@ class TensorNameMap: ), MODEL_TENSOR.V_MMPROJ_FC: ( - "model.connector.modality_projection.proj", # SmolVLM + "model.connector.modality_projection.proj", # SmolVLM ), MODEL_TENSOR.V_MMPROJ_MLP: (