30 changes: 28 additions & 2 deletions convert_hf_to_gguf.py
@@ -3348,7 +3348,7 @@ def set_gguf_parameters(self):
self.gguf_writer.add_file_type(self.ftype)


@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "DotsOCRForCausalLM")
class Qwen2Model(TextModel):
model_arch = gguf.MODEL_ARCH.QWEN2

@@ -3374,7 +3374,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
name = name.replace("language_model.", "") # for InternVL
if name.startswith("mlp") or name.startswith("multi_modal_projector") \
or name.startswith("vision_model") or name.startswith("audio_tower") \
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector") \
or name.startswith("vision_tower."):
# skip vision and audio tensors
return []
yield from super().modify_tensors(data_torch, name, bid)
@@ -10074,6 +10075,31 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
return []


@ModelBase.register("DotsOCRForCausalLM")
class DotsOCRVisionModel(MmprojModel):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
assert self.hparams_vision is not None
self.hparams_vision["image_size"] = 0 # dynamic resolution

def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.DOTSOCR)
self.gguf_writer.add_vision_image_min_pixels(self.preprocessor_config["min_pixels"])
self.gguf_writer.add_vision_image_max_pixels(self.preprocessor_config["max_pixels"])
self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["rms_norm_eps"]))
self.gguf_writer.add_vision_projector_scale_factor(self.find_vparam(["spatial_merge_size"]))
self.gguf_writer.add_vision_use_silu(True)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

if name.startswith("vision_tower."):
return [(self.map_tensor_name(name), data_torch)]

return [] # skip other tensors


###### CONVERSION LOGIC ######


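Taken together, the two registrations above split a dots.ocr checkpoint in two during conversion: Qwen2Model produces the text-model GGUF and drops everything under the vision_tower. prefix, while DotsOCRVisionModel keeps only those tensors for the mmproj GGUF. A minimal sketch of that routing (the tensor names are illustrative examples, not taken from the actual checkpoint):

# Sketch only: mirrors the prefix checks above, not the converter's actual dispatch code.
def route_tensor(name: str) -> str:
    if name.startswith("vision_tower."):
        return "mmproj"      # kept by DotsOCRVisionModel.modify_tensors
    return "text model"      # converted by Qwen2Model (gguf.MODEL_ARCH.QWEN2)

assert route_tensor("vision_tower.blocks.0.attn.qkv.weight") == "mmproj"
assert route_tensor("model.layers.0.self_attn.q_proj.weight") == "text model"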
3 changes: 3 additions & 0 deletions gguf-py/gguf/constants.py
@@ -280,6 +280,8 @@ class Clip:

class ClipVision:
IMAGE_SIZE = "clip.vision.image_size"
IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
@@ -3304,6 +3306,7 @@ class VisionProjectorType:
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
JANUS_PRO = "janus_pro"
DOTSOCR = "dots_ocr"


# Items here are (block size, type size)
6 changes: 6 additions & 0 deletions gguf-py/gguf/gguf_writer.py
@@ -1118,6 +1118,12 @@ def add_vision_n_wa_pattern(self, value: int) -> None:
def add_vision_is_deepstack_layers(self, layers: Sequence[bool]) -> None:
self.add_array(Keys.ClipVision.IS_DEEPSTACK_LAYERS, layers)

def add_vision_image_max_pixels(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_MAX_PIXELS, value)

def add_vision_image_min_pixels(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)

# audio models

def add_audio_projection_dim(self, value: int) -> None:
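For illustration, a minimal sketch of how these setters are used once this patch is applied (they do not exist in older gguf-py releases): the converter passes preprocessor_config["min_pixels"] and ["max_pixels"] into them, and the values land as uint32 metadata under clip.vision.image_min_pixels / clip.vision.image_max_pixels. The numbers and path below are placeholders, not the real dots.ocr preprocessor config:

import gguf

w = gguf.GGUFWriter("mmproj-sketch.gguf", arch="clip")  # illustrative output path
w.add_vision_image_min_pixels(3136)       # placeholder, stands in for preprocessor_config["min_pixels"]
w.add_vision_image_max_pixels(11289600)   # placeholder, stands in for preprocessor_config["max_pixels"]
# ... remaining metadata, tensors, and the usual write_header/write_kv/write_tensors calls would follow
w.close()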
12 changes: 12 additions & 0 deletions gguf-py/gguf/tensor_mapping.py
@@ -1189,6 +1189,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
"vision_tower.merger.mlp.{bid}", # dots.ocr
),

MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1225,6 +1226,7 @@ class TensorNameMap:
"visual.patch_embed.proj", # qwen2vl
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"vision_tower.patch_embed.patchifier.proj", # dots.ocr
),

MODEL_TENSOR.V_ENC_EMBD_POS: (
@@ -1240,6 +1242,7 @@

MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"vision_tower.blocks.{bid}.attn.qkv", # dots.ocr
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
),

@@ -1301,6 +1304,7 @@ class TensorNameMap:
"visual.blocks.{bid}.norm1", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"vision_tower.blocks.{bid}.norm1", # dots.ocr
),

MODEL_TENSOR.V_ENC_ATTN_O: (
@@ -1316,6 +1320,7 @@
"visual.blocks.{bid}.attn.proj", # qwen2vl
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"vision_tower.blocks.{bid}.attn.proj", # dots.ocr
),

MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
@@ -1330,6 +1335,7 @@
"visual.blocks.{bid}.norm2", # qwen2vl
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"vision_tower.blocks.{bid}.norm2", # dots.ocr
),

MODEL_TENSOR.V_ENC_FFN_UP: (
@@ -1345,12 +1351,14 @@
"visual.blocks.{bid}.mlp.linear_fc1", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"vision_tower.blocks.{bid}.mlp.fc3", # dots.ocr
),

MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
"vision_tower.blocks.{bid}.mlp.fc1", # dots.ocr
),

MODEL_TENSOR.V_ENC_FFN_DOWN: (
@@ -1366,6 +1374,7 @@
"visual.blocks.{bid}.mlp.linear_fc2", # qwen3vl
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"vision_tower.blocks.{bid}.mlp.fc2", # dots.ocr
),

MODEL_TENSOR.V_LAYER_SCALE_1: (
@@ -1383,6 +1392,7 @@
"vision_tower.ln_pre", # pixtral-hf
"vision_encoder.ln_pre", # pixtral
"vision_model.layernorm_pre", # llama4
"vision_tower.patch_embed.patchifier.norm", # dots.ocr
),

MODEL_TENSOR.V_POST_NORM: (
@@ -1391,6 +1401,7 @@
"vision_model.layernorm_post", # llama4
"visual.merger.ln_q", # qwen2vl
"vision_tower.encoder.final_layernorm", # kimi-vl
"vision_tower.post_trunk_norm", # dots.ocr
),

MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1403,6 +1414,7 @@
"multi_modal_projector.pre_norm",
"pre_mm_projector_norm",
"model.vision.linear_proj.norm1", # cogvlm
"vision_tower.merger.ln_q", # dots.ocr
),

MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
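A quick way to sanity-check the new dots.ocr entries is to resolve one of the checkpoint's tensor names through TensorNameMap. This is a sketch assuming this patch is applied and that mmproj conversion goes through gguf.MODEL_ARCH.MMPROJ; the block count and tensor name are arbitrary examples:

import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 42)  # 42 = arbitrary block count
mapped = tmap.get_name("vision_tower.blocks.0.attn.qkv.weight", try_suffixes=(".weight", ".bias"))
print(mapped)  # expected: the canonical V_ENC_ATTN_QKV name for block 0, e.g. "v.blk.0.attn_qkv.weight"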
4 changes: 4 additions & 0 deletions tools/mtmd/clip-impl.h
@@ -32,6 +32,8 @@
// vision-specific
#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
@@ -156,6 +158,7 @@ enum projector_type {
PROJECTOR_TYPE_LIGHTONOCR,
PROJECTOR_TYPE_COGVLM,
PROJECTOR_TYPE_JANUS_PRO,
PROJECTOR_TYPE_DOTS_OCR,
PROJECTOR_TYPE_UNKNOWN,
};

@@ -182,6 +185,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
{ PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
{ PROJECTOR_TYPE_COGVLM, "cogvlm"},
{ PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
{ PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"},
};

static projector_type clip_projector_type_from_string(const std::string & str) {
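The "dots_ocr" string added to PROJECTOR_TYPE_NAMES has to match what the converter writes via gguf.VisionProjectorType.DOTSOCR; otherwise clip_projector_type_from_string() cannot resolve it (the enum carries PROJECTOR_TYPE_UNKNOWN for that case). The new KEY_IMAGE_MIN_PIXELS / KEY_IMAGE_MAX_PIXELS macros name the same metadata that the writer methods above emit. A small check from the Python side that a converted mmproj file actually carries those keys, as a sketch with an illustrative filename:

from gguf import GGUFReader

reader = GGUFReader("mmproj-dots-ocr.gguf")  # illustrative path to a converted mmproj file
for key in ("clip.vision.image_min_pixels", "clip.vision.image_max_pixels"):
    field = reader.get_field(key)
    print(key, "->", "present" if field is not None else "missing")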