100 changes: 81 additions & 19 deletions convert_hf_to_gguf.py
@@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
def load_hparams(dir_model: Path):
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
architectures = hparams.get("architectures")
if "text_config" in hparams:
hparams = {**hparams, **hparams["text_config"]}
if architectures is not None:
# preserve "architectures" from root level config
hparams["architectures"] = architectures
return hparams

@classmethod
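
This flattening lets a multimodal config.json reuse the text-model code paths while the root-level "architectures" list still identifies the wrapper model. A minimal runnable sketch of the effect, with an illustrative config rather than one copied from a real model:

```python
# Minimal sketch of the flattening above; the config dict is illustrative,
# not copied from a real model.
config = {
    "architectures": ["SmolVLMForConditionalGeneration"],
    "text_config": {"architectures": ["LlamaForCausalLM"], "hidden_size": 2048},
    "vision_config": {"hidden_size": 1152},
}

hparams = config
architectures = hparams.get("architectures")
if "text_config" in hparams:
    hparams = {**hparams, **hparams["text_config"]}
if architectures is not None:
    # preserve "architectures" from the root-level config
    hparams["architectures"] = architectures

assert hparams["hidden_size"] == 2048                                   # pulled up from text_config
assert hparams["architectures"] == ["SmolVLMForConditionalGeneration"]  # root-level value kept
```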
@@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab
class VisionModel(ModelBase):
model_arch = gguf.MODEL_ARCH.CLIP_VISION
n_text_embd = 0
preprocessor_config: dict[str, Any]
global_config: dict[str, Any]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs):

if "vision_config" not in self.hparams:
raise ValueError("vision_config not found in hparams")
# move vision config to the top level
# move vision config to the top level, while preserving the original hparams in global_config
self.global_config = self.hparams
self.hparams = self.hparams["vision_config"]

# load preprocessor config
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)

def set_type(self):
self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION)

def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text)
self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
self.gguf_writer.add_vision_has_vision_encoder(True)

# vision config
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"]))
self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"]))
self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"]))
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"]))
self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"]))

# preprocessor config
self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"])
self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"])

def write_vocab(self):
raise ValueError("VisionModel does not support vocab writing")
@@ -1703,11 +1718,23 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed norms: {norms}")


@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM")
@ModelBase.register(
"LLaMAForCausalLM",
"LlamaForCausalLM",
"MistralForCausalLM",
"MixtralForCausalLM",
"Idefics3ForConditionalGeneration",
"SmolVLMForConditionalGeneration")
class LlamaModel(TextModel):
model_arch = gguf.MODEL_ARCH.LLAMA
undo_permute = True

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing `num_attention_heads` in config.json
if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)

def set_vocab(self):
try:
self._set_vocab_sentencepiece()
@@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name

if is_vision_tensor:
return [] # skip vision tensors
elif name.startswith("model.text_model"):
name = name.replace("text_model.", "") # for SmolVLM

if self.undo_permute:
if name.endswith(("q_proj.weight", "q_proj.bias")):
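
The routing added at the top of this hunk drops vision tensors (they are handled by the vision model class below) and strips the SmolVLM text_model. prefix so the remaining names match the plain Llama layout. A minimal sketch of that filtering; the tensor names are hypothetical examples, not taken from a specific checkpoint:

```python
# Minimal sketch of the tensor-name routing added above; tensor names are hypothetical.
def route_text_tensor(name: str) -> str | None:
    """Return the name LlamaModel keeps, or None when the tensor is skipped."""
    is_vision_tensor = (
        "vision_tower" in name or "vision_model" in name or "model.connector" in name
    )
    if is_vision_tensor:
        return None                             # handled by the VisionModel subclass instead
    if name.startswith("model.text_model"):
        name = name.replace("text_model.", "")  # SmolVLM nests the language model
    return name

print(route_text_tensor("model.vision_model.encoder.layers.0.self_attn.q_proj.weight"))  # None
print(route_text_tensor("model.text_model.layers.0.self_attn.q_proj.weight"))
# -> model.layers.0.self_attn.q_proj.weight
```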
@@ -1852,6 +1885,41 @@ def prepare_tensors(self):
raise ValueError(f"Unprocessed experts: {experts}")


@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
class SmolVLMModel(VisionModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# fix for SmolVLM2, missing some keys in config.json
# default values are taken from transformers code
if self.hparams["model_type"] == "smolvlm_vision":
self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152)
self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072)
self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12)
Comment on lines +1894 to +1898
ngxson (Collaborator, Author) commented on Apr 21, 2025:
@compilade recently, I have seen many models missing keys in config.json. Just wondering, at some point, should we use AutoConfig in load_hparams to prevent this from happening?
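
If the AutoConfig route suggested in this comment were taken, load_hparams could let transformers fill in architecture defaults instead of patching missing keys per model. A minimal sketch of that idea, assuming transformers is importable and the model directory is a standard HF checkout (this is not part of the PR):

```python
# Sketch of the AutoConfig idea from the comment above (not part of this PR).
from pathlib import Path
from transformers import AutoConfig

def load_hparams_via_autoconfig(dir_model: Path) -> dict:
    # AutoConfig applies the library's per-architecture defaults, so keys that are
    # missing from config.json (e.g. num_attention_heads) come back filled in.
    config = AutoConfig.from_pretrained(dir_model)
    hparams = config.to_dict()
    if "text_config" in hparams:
        hparams = {**hparams, **hparams["text_config"]}
    return hparams
```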


def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3)
self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5))
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
return False

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name

if is_vision_tensor:
return [(self.map_tensor_name(name), data_torch)]

return [] # skip other tensors
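
The tensor_force_quant override above pins embedding tensors to F32 even when the rest of the file is quantized. A small sketch of the effect, assuming the gguf-py package is installed; the tensor names are hypothetical examples:

```python
# Sketch of the tensor_force_quant rule above: embedding tensors stay in F32,
# everything else follows the requested file type (names are hypothetical).
import gguf

def forced_type(name: str):
    return gguf.GGMLQuantizationType.F32 if ".embeddings." in name else False

print(forced_type("model.vision_model.embeddings.patch_embedding.weight"))   # F32
print(forced_type("model.vision_model.encoder.layers.0.self_attn.q_proj.weight"))  # False
```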


@ModelBase.register("Llama4ForConditionalGeneration")
class Llama4Model(LlamaModel):
model_arch = gguf.MODEL_ARCH.LLAMA4
@@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel):
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3")
self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3)
# default values below are taken from HF transformers code
self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5])
self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5])
self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True)
self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
self.gguf_writer.add_vision_use_gelu(True)

def tensor_force_quant(self, name, new_name, bid, n_dims):
del bid, new_name, n_dims # unused
@@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
or name.startswith("multimodal_projector.") or name.startswith("vision_model."):
# process vision tensors
name = name.replace("_weight", ".weight")
if "fc1" in name:
name = name.replace("fc1", "fc2")
else:
name = name.replace("fc2", "fc1")

# correct norm value; only this "soft_emb_norm" needs to be corrected as it's part of the Gemma projector
# the other norm values are part of SigLIP model, and they are already correct
7 changes: 5 additions & 2 deletions examples/llava/clip-impl.h
@@ -33,13 +33,13 @@
#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon"
#define KEY_PROJ_DIM "clip.%s.projection_dim"
#define KEY_TOKENS "tokenizer.ggml.tokens"
#define KEY_N_POSITIONS "clip.text.context_length"
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
#define KEY_PROJ_TYPE "clip.projector_type"
#define KEY_FEATURE_LAYER "clip.vision.feature_layer"
#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor"
#define KEY_PROJ_TYPE "clip.projector_type"

#define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type"
#define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints"
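
One quick way to confirm that a converted mmproj file actually carries the new projector keys is to read them back with gguf-py's GGUFReader. A minimal sketch, assuming gguf-py is installed; the file name is illustrative:

```python
# Sketch: confirm the new projector keys in a converted GGUF
# (file name is illustrative; assumes the gguf-py package is installed).
from gguf import GGUFReader

reader = GGUFReader("mmproj-model.gguf")
for key in ("clip.projector_type", "clip.vision.projector.scale_factor"):
    print(key, "present" if reader.get_field(key) is not None else "missing")
```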
@@ -72,6 +72,7 @@
#define TN_IMAGE_NEWLINE "model.image_newline"
#define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3
#define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3
#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3

// minicpmv
#define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
@@ -99,6 +100,7 @@ enum projector_type {
PROJECTOR_TYPE_GLM_EDGE,
PROJECTOR_TYPE_MERGER,
PROJECTOR_TYPE_GEMMA3,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_UNKNOWN,
};

@@ -110,6 +112,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_GLM_EDGE, "adapter"},
{ PROJECTOR_TYPE_MERGER, "qwen2vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {