From 2787888e4d6fde368affa8fe5647dbe2427f9be3 Mon Sep 17 00:00:00 2001 From: liyang Date: Wed, 22 Oct 2025 16:38:24 +0800 Subject: [PATCH] mtmd/clip+converter: address #16574; fold CLI into mtmd-cli, fix converter keys, rope_ext + bicubic mtmd-cli: move the standalone Jina CLI into mtmd-cli (projector-only path); drop the extra binary. --- convert_hf_to_gguf.py | 325 +++++++++++++++++++++- gguf-py/gguf/constants.py | 1 + tools/mtmd/clip-impl.h | 47 ++-- tools/mtmd/clip.cpp | 541 ++++++++++++++++++++++++++++++++----- tools/mtmd/clip.h | 1 + tools/mtmd/mtmd-cli.cpp | 147 +++++++++- tools/mtmd/mtmd-helper.cpp | 14 + tools/mtmd/mtmd-helper.h | 5 + tools/mtmd/mtmd.cpp | 95 +++++++ tools/mtmd/mtmd.h | 30 ++ 10 files changed, 1097 insertions(+), 109 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ed99dc8477231..0ebe5eea99e11 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5026,7 +5026,18 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, if lora_names := hparams.get("lora_adaptations"): self._lora_names = lora_names - self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + try: + text_cfg = hparams.get("text_config", {}) if isinstance(hparams.get("text_config", {}), dict) else {} + pe_type = (text_cfg.get("position_embedding_type") or hparams.get("position_embedding_type") or "").lower() + rope_base = text_cfg.get("rotary_emb_base", hparams.get("rotary_emb_base")) + name_path = (hparams.get("_name_or_path") or "").lower() + is_vx = ("jina" in name_path and ("v2" in name_path or "v3" in name_path)) + is_v3 = (pe_type == "rotary" or rope_base is not None) and is_vx + if (is_v3) or self._lora_names: + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + except Exception: + pass super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) self._xlmroberta_tokenizer_init() @@ -6248,6 +6259,293 @@ def set_vocab(self): raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') +@ModelBase.register("JinaCLIPVisionModel", "JinaCLIPModel") +class JinaCLIPVisionModel(MmprojModel): + """JinaCLIP v2 Vision Encoder Model - handles vision component only""" + model_arch = gguf.MODEL_ARCH.MMPROJ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Load config for vision encoder + config_path = self.dir_model / "config.json" + if config_path.exists(): + with open(config_path, encoding="utf-8") as f: + self.vision_config = json.load(f) + else: + # Default JinaCLIP v2 vision configuration + self.vision_config = { + "image_size": 448, + "patch_size": 14, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 2731, + "layer_norm_eps": 1e-5, + "projection_dim": 1024 + } + + def set_vocab(self): + # Vision encoder doesn't need vocabulary + pass + + def set_gguf_parameters(self): + # Identification (arch/name is set by writer); mark vision encoder presence + self.gguf_writer.add_clip_has_vision_encoder(True) + + # Vision parameters + config = self.vision_config + img_sz = int(config.get("image_size", 448)) + patch_sz = int(config.get("patch_size", 14)) + n_embd = int(config.get("hidden_size", 1024)) + n_layer = int(config.get("num_hidden_layers", 24)) + n_head = int(config.get("num_attention_heads", 16)) + n_ff = int(config.get("intermediate_size", 2731)) + proj_dim = int(config.get("projection_dim", 1024)) + + # Use gguf writer helpers (constants + typed setters) + self.gguf_writer.add_vision_image_size(img_sz) + 
self.gguf_writer.add_vision_patch_size(patch_sz) + self.gguf_writer.add_vision_embedding_length(n_embd) + self.gguf_writer.add_vision_block_count(n_layer) + self.gguf_writer.add_vision_projection_dim(proj_dim) + self.gguf_writer.add_vision_feed_forward_length(n_ff) + self.gguf_writer.add_vision_head_count(n_head) + # LayerNorm epsilon comes from config (fallback 1e-5) + eps_attn = float(config.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_attention_layernorm_eps(eps_attn) + + # Preprocessing defaults + self.gguf_writer.add_vision_image_mean([0.48145466, 0.4578275, 0.40821073]) + self.gguf_writer.add_vision_image_std ([0.26862954, 0.26130258, 0.27577711]) + + # Projector type and activation + # JinaCLIP v2 projector type string follows upstream style (family+major) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JINACLIP2) + self.gguf_writer.add_vision_use_silu(True) + + # RoPE parameter used by vision encoder (prefer config override) + try: + rt = config.get("rope_theta", None) + rope_theta = float(rt) if rt is not None else 10000.0 + except Exception: + rope_theta = 10000.0 + # writer currently has no dedicated setter for this key; keep direct write + self.gguf_writer.add_float32("clip.vision.rope_theta", rope_theta) + + # Compatibility (mmproj) — not covered by gguf writer helpers yet + self.gguf_writer.add_uint32("mmproj.embedding_length", n_embd) + self.gguf_writer.add_uint32("mmproj.block_count", n_layer) + + logger.info( + "mmproj(jinaclip): image_size=%d patch_size=%d n_embd=%d n_layer=%d n_head=%d n_ff=%d proj_dim=%d", + img_sz, patch_sz, n_embd, n_layer, n_head, n_ff, proj_dim + ) + + # helpers to keep modify_tensors compact and consistent with other models + def _strip_vm_prefix(self, name: str) -> str: + return name[len('vision_model.'):] if name.startswith('vision_model.') else name + + def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None: + parts = rest.split('.') + # layer norms + if rest.startswith('norm1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ln_1.{suffix}', data_torch)] + if rest.startswith('norm2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ln_2.{suffix}', data_torch)] + if rest.startswith('attn.inner_attn_ln.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_ln.{suffix}', data_torch)] + + # fused qkv + if rest == 'attn.qkv.weight': + w = data_torch + wdim = w.shape[0] + if wdim % 3 != 0: + logger.warning('mmproj(jinaclip): unexpected qkv weight shape %s for %s', tuple(w.shape), name) + d = wdim // 3 + q, k, v = w[0:d, :], w[d:2 * d, :], w[2 * d:, :] + return [ + (f'v.blk.{layer}.attn_q.weight', q), + (f'v.blk.{layer}.attn_k.weight', k), + (f'v.blk.{layer}.attn_v.weight', v), + ] + if rest == 'attn.qkv.bias': + b = data_torch + bdim = b.shape[0] + if bdim % 3 != 0: + logger.warning('mmproj(jinaclip): unexpected qkv bias shape %s for %s', tuple(b.shape), name) + d = bdim // 3 + qb, kb, vb = b[0:d], b[d:2 * d], b[2 * d:] + return [ + (f'v.blk.{layer}.attn_q.bias', qb), + (f'v.blk.{layer}.attn_k.bias', kb), + (f'v.blk.{layer}.attn_v.bias', vb), + ] + # separate q/v bias (some checkpoints) + if rest == 'attn.q_bias': + return [(f'v.blk.{layer}.attn_q.bias', data_torch)] + if rest == 'attn.v_bias': + return [(f'v.blk.{layer}.attn_v.bias', data_torch)] + + # separate projections + if rest.startswith('attn.q_proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_q.{suffix}', data_torch)] + if rest.startswith('attn.k_proj.'): + suffix = parts[-1] + 
return [(f'v.blk.{layer}.attn_k.{suffix}', data_torch)] + if rest.startswith('attn.v_proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_v.{suffix}', data_torch)] + if rest.startswith('attn.proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_out.{suffix}', data_torch)] + + # MLP + if rest.startswith('mlp.w1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_gate.{suffix}', data_torch)] + if rest.startswith('mlp.w2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)] + if rest.startswith('mlp.w3.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)] + if rest.startswith('mlp.ffn_ln.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_norm.{suffix}', data_torch)] + if rest.startswith('mlp.fc1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)] + if rest.startswith('mlp.fc2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)] + return None + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + """Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper.""" + # Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is + if name.startswith('v.') or name.startswith('mm.'): + return name + # Try the base mapping first + try: + return super().map_tensor_name(name, try_suffixes=try_suffixes) + except Exception: + # Fallback to legacy Jina-specific mapper for any remaining edge keys + if hasattr(self, "_map_jinaclip_tensor_name"): + mapped = self._map_jinaclip_tensor_name(name) # type: ignore[attr-defined] + if mapped: + return mapped + return name + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + """Yield tensors for the vision encoder. + + Prefer the base implementation (supports sharded/indexed weights). If that fails + or no parts are detected, fall back to a direct single-file load. + """ + # Try base path (indexed/sharded) + try: + if getattr(self, "part_names", None): + for name, tensor in super().get_tensors(): + yield name, tensor + return + except Exception as e: + logger.warning("mmproj(jinaclip): base get_tensors failed, falling back to direct load: %s", e) + + # Fallback: direct single-file load + import torch + candidates = [ + self.dir_model / "pytorch_model.bin", + self.dir_model / "model.safetensors", + self.dir_model / "pytorch_model.safetensors", + self.dir_model / "vision_model_weights.bin", + ] + model_path = next((p for p in candidates if p.exists()), None) + if model_path is None: + raise FileNotFoundError(f"mmproj(jinaclip): no model weights found in {self.dir_model}") + + logger.info("mmproj(jinaclip): loading weights from %s", model_path) + if model_path.suffix == ".bin": + state_dict = torch.load(model_path, map_location="cpu", weights_only=True) + else: + from safetensors.torch import load_file + state_dict = load_file(str(model_path)) + + count = 0 + for name, tensor in state_dict.items(): + # yield raw names; modify_tensors will normalize & map + yield name, tensor + count += 1 + + logger.info("mmproj(jinaclip): yielded %d raw tensors", count) + + def _should_be_f32(self, gguf_name: str) -> bool: + """Return True if tensor should be stored as F32 to avoid type mismatches in C++ runtime. + + Keep the list minimal: LayerNorm weights/bias are the common source of + binary-op dtype issues; patch embedding bias is also safer as F32. 
+ """ + patterns = ( + ".ln_1.weight", ".ln_1.bias", + ".ln_2.weight", ".ln_2.bias", + ".attn_ln.weight", ".attn_ln.bias", + ".ffn_norm.weight", ".ffn_norm.bias", + "v.patch_embd.proj.bias", + ) + return any(p in gguf_name for p in patterns) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """Normalize JinaCLIP vision tensor names to base-friendly patterns, with Jina-specific exceptions. + + - Emit Jina-specific targets directly for: patch/proj, pos_embed, inner-attn LN, SwiGLU FFN names. + - If fused QKV is encountered, split into Q/K/V. + - For standard pieces (norm1/norm2, q/k/v/out), map to v.blk.{i}.* targets. + """ + del bid # unused + + src = name + if src.startswith('v.') or src.startswith('mm.'): + return [(src, data_torch)] + + # Drop 'vision_model.' prefix if present + src_no_vm = self._strip_vm_prefix(src) + + # Top-level direct mappings — use gguf constants directly for canonical names + if src_no_vm == 'cls_token': + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_CLS] + return [(base, data_torch)] + if src_no_vm.startswith('patch_embed.proj.'): + suffix = src_no_vm.split('.')[-1] + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + return [(f'{base}.{suffix}', data_torch)] + if src_no_vm == 'pos_embed': + pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight' + return [(pos_name, data_torch)] + if src_no_vm.startswith('norm.'): + suffix = src_no_vm.split('.')[-1] + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM] + return [(f'{base}.{suffix}', data_torch)] + + if src_no_vm.startswith('blocks.'): + parts = src_no_vm.split('.') + if len(parts) >= 3 and parts[1].isdigit(): + layer = int(parts[1]) + rest = '.'.join(parts[2:]) + mapped = self._map_block_tensor(layer, rest, data_torch, name) + if mapped is not None: + return mapped + + try: + return [(self.map_tensor_name(name), data_torch)] + except Exception: + logger.debug("mmproj(jinaclip): skip unmapped tensor %s", name) + return [] + + @ModelBase.register("OpenELMForCausalLM") class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @@ -9614,16 +9912,21 @@ def main() -> None: else: model_class = MistralModel - model_instance = model_class(dir_model, output_type, fname_out, - is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules - ) + common_kwargs = dict( + is_big_endian=args.bigendian, + use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, + model_name=args.model_name, + split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), + dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split, + remote_hf_model_id=hf_repo_id, + disable_mistral_community_chat_template=disable_mistral_community_chat_template, + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + ) + model_instance = model_class(dir_model, output_type, fname_out, **common_kwargs) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/gguf-py/gguf/constants.py 
b/gguf-py/gguf/constants.py index 1b71fb3749aaa..6a284a1cd8441 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3057,6 +3057,7 @@ class VisionProjectorType: QWEN25VL = "qwen2.5vl_merger" ULTRAVOX = "ultravox" INTERNVL = "internvl" + JINACLIP2 = "jinaclip2" QWEN2A = "qwen2a" # audio QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 1669fad99b36b..86b19c53c5203 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -39,6 +39,8 @@ #define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" +// RoPE base (theta) for vision encoder +#define KEY_VISION_ROPE_THETA "clip.vision.rope_theta" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -67,14 +69,15 @@ #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_LN "%s.blk.%d.attn_ln.%s" // inner attention LayerNorm #define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" #define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm -#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_LN_1 "%s.blk.%d.ln_1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln_2.%s" // layer norm #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale #define TN_LN_PRE "%s.pre_ln.%s" @@ -137,30 +140,32 @@ enum projector_type { PROJECTOR_TYPE_QWEN2A, PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2 PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_MINICPMV, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, - { PROJECTOR_TYPE_INTERNVL, "internvl"}, - { PROJECTOR_TYPE_LLAMA4, "llama4"}, - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, - { PROJECTOR_TYPE_LFM2, "lfm2"}, - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2" }, + { PROJECTOR_TYPE_MINICPMV, "resampler" }, + { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, + { PROJECTOR_TYPE_GEMMA3, "gemma3" }, + { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, + { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, + { PROJECTOR_TYPE_INTERNVL, "internvl" }, + { PROJECTOR_TYPE_LLAMA4, "llama4" }, + { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, + { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, + { 
PROJECTOR_TYPE_JINACLIP2, "jinaclip2" }, + { PROJECTOR_TYPE_LFM2, "lfm2" }, + { PROJECTOR_TYPE_KIMIVL, "kimivl" }, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f2abf88523843..0c736b79e7182 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -218,6 +218,10 @@ struct clip_layer { ggml_tensor * o_w = nullptr; ggml_tensor * o_b = nullptr; + // inner attention LayerNorm (JinaCLIP specific) + ggml_tensor * attn_ln_w = nullptr; + ggml_tensor * attn_ln_b = nullptr; + ggml_tensor * k_norm = nullptr; ggml_tensor * q_norm = nullptr; @@ -232,6 +236,10 @@ struct clip_layer { ggml_tensor * ff_down_w = nullptr; ggml_tensor * ff_down_b = nullptr; + // FFN sub-layer norm (GGUF: v.blk.{i}.ffn_norm.{weight,bias}) + ggml_tensor * ffn_norm_w = nullptr; + ggml_tensor * ffn_norm_b = nullptr; + // layernorm 2 ggml_tensor * ln_2_w = nullptr; ggml_tensor * ln_2_b = nullptr; @@ -424,9 +432,8 @@ struct clip_ctx { backend_ptrs.push_back(backend_cpu); backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); - sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) - ); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, + false, true)); } ~clip_ctx() { @@ -762,8 +769,8 @@ struct clip_graph { ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, attn_mask, + kq_scale, il); cb(cur, "attn_out", il); } @@ -779,11 +786,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, hparams.ffn_op, il); cb(cur, "ffn_out", il); @@ -895,10 +899,9 @@ struct clip_graph { cb(K, "resampler_K", -1); cb(V, "resampler_V", -1); - embeddings = build_attn( - model.mm_model_attn_o_w, - model.mm_model_attn_o_b, - Q, K, V, nullptr, kq_scale, -1); + embeddings = build_attn(model.mm_model_attn_o_w, model.mm_model_attn_o_b, nullptr, + nullptr, // no inner attn LayerNorm for resampler + Q, K, V, nullptr, kq_scale, -1); cb(embeddings, "resampler_attn_out", -1); } // layernorm @@ -1229,8 +1232,7 @@ struct clip_graph { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -1246,11 +1248,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, hparams.ffn_op, il); cb(cur, "ffn_out", il); @@ -1509,6 +1508,177 @@ struct clip_graph { return gf; } + ggml_cgraph * build_jina() { + const int n_pos = n_patches + (model.class_embedding ? 
1 : 0);
+
+        GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+        // JinaCLIP uses the full encoder depth; no earlier feature layer is selected
+        int max_feature_layer = n_layer;
+
+        ggml_tensor * inp = build_inp();
+
+        if (ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2) {
+            ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+            ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+            ggml_set_name(pos_h, "pos_h");
+            ggml_set_name(pos_w, "pos_w");
+            ggml_set_input(pos_h);
+            ggml_set_input(pos_w);
+            ggml_build_forward_expand(gf, pos_h);
+            ggml_build_forward_expand(gf, pos_w);
+
+            // per-dim frequency scaling vectors for RoPE halves (default c-tensor path)
+            ggml_tensor * rope_c_first  = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head/2);
+            ggml_tensor * rope_c_second = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head/2);
+            ggml_set_name(rope_c_first, "rope_c_first");
+            ggml_set_name(rope_c_second, "rope_c_second");
+            ggml_set_input(rope_c_first);
+            ggml_set_input(rope_c_second);
+            ggml_build_forward_expand(gf, rope_c_first);
+            ggml_build_forward_expand(gf, rope_c_second);
+
+        }
+        // concat class_embeddings and patch_embeddings
+        if (model.class_embedding) {
+            inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+        }
+
+        // Position encoding: JinaCLIP uses BOTH learned position embeddings AND 2D RoPE (applied inside attention)
+        ggml_tensor * positions = nullptr;
+
+        // 1) add learned position embeddings (x = x + pos_embed)
+        positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+        ggml_tensor * inpL = inp;
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        }
+        std::vector<ggml_tensor *> embedding_stack;
+        const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+        // loop over layers
+        for (int il = 0; il < max_feature_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // If this is an embedding feature layer, save the output.
+            // NOTE: 0 index here refers to the input to the encoder.
+            if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+                embedding_stack.push_back(cur);
+            }
+
+            // layernorm
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+                Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, n_pos, 1);
+                Kcur = ggml_reshape_4d(ctx0, Kcur, d_head, n_head, n_pos, 1);
+                Vcur = ggml_reshape_4d(ctx0, Vcur, d_head, n_head, n_pos, 1);
+
+                ggml_tensor * Q_rope_in = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                ggml_tensor * K_rope_in = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
+
+                // Apply 2D RoPE position encoding for JinaCLIP (only to patch tokens, skip CLS token)
+                // JinaCLIP has CLS token at position 0, only apply RoPE to patch tokens (position 1+)
+                // Following Python implementation: q[:, :, 1:, :] gets RoPE applied
+
+                // Split Q: CLS token (pos 0) + patch tokens (pos 1+)
+                ggml_tensor * Q_cls =
+                    ggml_view_3d(ctx0, Q_rope_in, d_head, 1, n_head, Q_rope_in->nb[1], Q_rope_in->nb[2], 0);
+                ggml_tensor * Q_patches = ggml_view_3d(ctx0, Q_rope_in, d_head, n_pos - 1, n_head, Q_rope_in->nb[1],
+                                                       Q_rope_in->nb[2], Q_rope_in->nb[1]);
+
+                // Split K: CLS token (pos 0) + patch tokens (pos 1+)
+                ggml_tensor * K_cls =
+                    ggml_view_3d(ctx0, K_rope_in, d_head, 1, n_head, K_rope_in->nb[1], K_rope_in->nb[2], 0);
+                ggml_tensor * K_patches = ggml_view_3d(ctx0, K_rope_in, d_head, n_pos - 1, n_head, K_rope_in->nb[1],
+                                                       K_rope_in->nb[2], K_rope_in->nb[1]);
+
+                int pt_seq_len = 16; // fallback pretrain length
+                if (hparams.patch_size > 0) {
+                    int cand = (int) llroundf(224.0f / (float) hparams.patch_size);
+                    if (cand > 0) {
+                        pt_seq_len = cand;
+                    }
+                }
+                const int hw_seq_len = static_cast<int>(sqrtf(n_pos - 1)); // image grid size (excluding CLS)
+                Q_patches = build_jinaclip_rope(ctx0, ctx, Q_patches, pt_seq_len, hw_seq_len, hparams.rope_theta, true,
+                                                true, il);
+                K_patches = build_jinaclip_rope(ctx0, ctx, K_patches, pt_seq_len, hw_seq_len, hparams.rope_theta, true,
+                                                false, il);
+                GGML_ASSERT(Q_cls->ne[0] == Q_patches->ne[0]);
+                GGML_ASSERT(Q_cls->ne[2] == Q_patches->ne[2]);
+                GGML_ASSERT(Q_cls->ne[3] == Q_patches->ne[3]);
+
+                // Recombine: CLS token + RoPE-processed patch tokens (seq is dim 1 in this layout)
+                ggml_tensor * Q_rope_out = ggml_concat(ctx0, Q_cls, Q_patches, 1);
+                ggml_tensor * K_rope_out = ggml_concat(ctx0, K_cls, K_patches, 1);
+
+                // Restore to pre-permute layout [dim, head, seq, 1]
+                Qcur = ggml_permute(ctx0, Q_rope_out, 0, 2, 1, 3);
+                Kcur = ggml_permute(ctx0, K_rope_out, 0, 2, 1, 3);
+
+                cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, nullptr,
+                                 kq_scale, il);
+            }
+
+            // re-add the layer input, e.g., residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(inpL, "inp_after_attn", il);
+
+            inpL = cur; // inpL = residual, cur = hidden_states
+
+            cb(cur, "ffn_inp", il);
+
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "ffn_inp_normed", il);
+            // ffn
+            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w,
+                            layer.ff_down_b, layer.ffn_norm_w, layer.ffn_norm_b, hparams.ffn_op, il);
+
+            // residual 2
+            cur =
ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + // final + + ggml_tensor * emb2d = ggml_reshape_2d(ctx0, inpL, inpL->ne[0], inpL->ne[1]); + ggml_tensor * cls = ggml_view_2d(ctx0, emb2d, emb2d->ne[0], /*rows=*/1, emb2d->nb[1], /*offset=*/0); + ggml_set_name(cls, "cls_view"); + ggml_build_forward_expand(gf, cls); + return gf; + } + // whisper encoder with custom projector ggml_cgraph * build_whisper_enc() { const int n_frames = img.nx; @@ -1727,8 +1897,8 @@ struct clip_graph { cb(Kcur, "Kcur_pos", il); } - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, nullptr, + kq_scale, il); cb(cur, "attn_out", il); } @@ -1749,11 +1919,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - ffn_t, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, ffn_t, il); cb(cur, "ffn_out", il); @@ -1837,17 +2004,17 @@ struct clip_graph { return cur; } - ggml_tensor * build_ffn( - ggml_tensor * cur, - ggml_tensor * up, - ggml_tensor * up_b, - ggml_tensor * gate, - ggml_tensor * gate_b, - ggml_tensor * down, - ggml_tensor * down_b, - ffn_op_type type_op, - int il) const { - + ggml_tensor * build_ffn(ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ggml_tensor * ffn_norm_w, + ggml_tensor * ffn_norm_b, + ffn_op_type type_op, + int il) const { ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; cb(tmp, "ffn_up", il); @@ -1904,6 +2071,12 @@ struct clip_graph { } break; } + // Apply FFN sub-layer LayerNorm if provided: x = ffn_ln(x) + if (ffn_norm_w || ffn_norm_b) { + cur = build_norm(cur, ffn_norm_w, ffn_norm_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_norm", il); + } + if (down) { cur = ggml_mul_mat(ctx0, down, cur); } @@ -1914,20 +2087,22 @@ struct clip_graph { if (down_b) { cur = ggml_add(ctx0, cur, down_b); + cb(cur, "ffn_down_b", il); } return cur; } - ggml_tensor * build_attn( - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale, - int il) const { + ggml_tensor * build_attn(ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * attn_ln_w, + ggml_tensor * attn_ln_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); @@ -1965,6 +2140,21 @@ struct clip_graph { cb(cur, "kqv_out", il); + // inner attention LayerNorm (JinaCLIP specific) + if (attn_ln_w && attn_ln_b) { + ggml_tensor * attn_ln_w_f32 = attn_ln_w; + ggml_tensor * attn_ln_b_f32 = attn_ln_b; + + if (attn_ln_w->type == GGML_TYPE_F16) { + attn_ln_w_f32 = ggml_cast(ctx0, attn_ln_w, GGML_TYPE_F32); + } + if (attn_ln_b->type == GGML_TYPE_F16) { + attn_ln_b_f32 = ggml_cast(ctx0, attn_ln_b, GGML_TYPE_F32); + } + // use model epsilon from metadata for consistency + cur = build_norm(cur, attn_ln_w_f32, attn_ln_b_f32, NORM_TYPE_NORMAL, hparams.eps, il); + } + if (wo) { cur = ggml_mul_mat(ctx0, wo, cur); } @@ -1976,6 +2166,92 @@ struct clip_graph { return cur; } + + // JinaCLIP RoPE: implement VisionRotaryEmbeddingFast via ggml_rope_ext with constant frequency scaling s = pt/ft + ggml_tensor * build_jinaclip_rope(ggml_context * ctx0, + clip_ctx * /*ctx*/, + ggml_tensor * cur, + const int pt_seq_len, + const int ft_seq_len, + const float freq_base, + const bool has_cls_token = true, + const bool /*if_query*/ = true, + const int /*layer_id*/ = -1) { + (void) pt_seq_len; + (void) ft_seq_len; + const int64_t n_dim = cur->ne[0]; + const int64_t n_pos_patches = cur->ne[1]; // seq length (patches-only) + const int64_t n_head = cur->ne[2]; + + GGML_ASSERT(n_dim % 2 == 0); + const int64_t half = n_dim/2; + + // constant frequency scaling (VisionRotaryEmbeddingFast): s = pt/ft + // use c-tensor path by default; freq_scale fixed to 1.0 + + ggml_tensor * pos_h_full = ggml_graph_get_tensor(gf, "pos_h"); + ggml_tensor * pos_w_full = ggml_graph_get_tensor(gf, "pos_w"); + GGML_ASSERT(pos_h_full && pos_w_full); + + const int64_t offset = has_cls_token ? 
1 : 0; + ggml_tensor * pos_h = ggml_view_1d(ctx0, pos_h_full, n_pos_patches, offset * (int64_t)ggml_element_size(pos_h_full)); + ggml_tensor * pos_w = ggml_view_1d(ctx0, pos_w_full, n_pos_patches, offset * (int64_t)ggml_element_size(pos_w_full)); + // fixed H/W selection; remove debug env toggles + ggml_tensor * pos_a = pos_h; + ggml_tensor * pos_b = pos_w; + + + ggml_tensor * first = ggml_view_3d(ctx0, cur, + half, n_head, n_pos_patches, + /*nb1 for head*/ cur->nb[2], + /*nb2 for seq */ cur->nb[1], + 0); + ggml_tensor * c_first = ggml_graph_get_tensor(gf, "rope_c_first"); + ggml_tensor * c_second = ggml_graph_get_tensor(gf, "rope_c_second"); + GGML_ASSERT(c_first && c_second); + + ggml_tensor * first_rot = ggml_rope_ext( + ctx0, + first, + pos_a, + c_first, + half, + 0, + 0, + freq_base, + 1.0f, + 0.0f, 1.0f, 0.0f, 0.0f); + first = ggml_view_3d(ctx0, first_rot, + half, n_pos_patches, n_head, + /*nb1 for seq */ first_rot->nb[2], + /*nb2 for head*/ first_rot->nb[1], + 0); + + ggml_tensor * second_hs = ggml_view_3d(ctx0, cur, + half, n_head, n_pos_patches, + /*nb1 for head*/ cur->nb[2], + /*nb2 for seq */ cur->nb[1], + /*offset*/ half * ggml_element_size(cur)); + ggml_tensor * second_rot = ggml_rope_ext( + ctx0, + second_hs, + pos_b, + c_second, + half, + 0, + 0, + freq_base, + 1.0f, + 0.0f, 1.0f, 0.0f, 0.0f); + ggml_tensor * second = ggml_view_3d(ctx0, second_rot, + half, n_pos_patches, n_head, + /*nb1 for seq */ second_rot->nb[2], + /*nb2 for head*/ second_rot->nb[1], + 0); + ggml_tensor * result = ggml_concat(ctx0, first, second, 0); + return result; + } + // implementation of the 2D RoPE without adding a new op in ggml // this is not efficient (use double the memory), but works on all backends // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 @@ -2121,6 +2397,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_whisper_enc(); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + res = graph.build_jina(); + } + break; case PROJECTOR_TYPE_KIMIVL: { res = graph.build_kimivl(); @@ -2218,7 +2499,7 @@ struct clip_model_loader { model.modality = modality; - // projector type + // projector type & optional version std::string proj_type; { // default key @@ -2247,6 +2528,8 @@ struct clip_model_loader { ? 
PROJECTOR_TYPE_QWEN25VL : PROJECTOR_TYPE_QWEN2A; } + + // JinaCLIP versions are distinguished by projector_type (e.g., jinaclip2) } const bool is_vision = model.modality == CLIP_MODALITY_VISION; @@ -2427,6 +2710,13 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor); set_llava_uhd_res_candidates(model, 3); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + // default RoPE base for vision; prefer GGUF override if present + hparams.rope_theta = 10000.0f; + get_f32(KEY_VISION_ROPE_THETA, hparams.rope_theta, /*required=*/false); + } + break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_VOXTRAL: @@ -2545,6 +2835,12 @@ struct clip_model_loader { layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + // JinaCLIP-specific inner attention LayerNorm is optional for other projectors + { + const bool need_attn_ln = model.proj_type == PROJECTOR_TYPE_JINACLIP2; + layer.attn_ln_w = get_tensor(string_format(TN_ATTN_LN, prefix, il, "weight"), need_attn_ln); + layer.attn_ln_b = get_tensor(string_format(TN_ATTN_LN, prefix, il, "bias"), need_attn_ln); + } layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); @@ -2555,22 +2851,20 @@ struct clip_model_loader { layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); + layer.ffn_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"), false); + layer.ffn_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"), false); // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
- bool is_ffn_swapped = ( + bool is_ffn_swapped = + ( // only old models need this fix - model.proj_type == PROJECTOR_TYPE_MLP - || model.proj_type == PROJECTOR_TYPE_MLP_NORM - || model.proj_type == PROJECTOR_TYPE_LDP - || model.proj_type == PROJECTOR_TYPE_LDPV2 - || model.proj_type == PROJECTOR_TYPE_QWEN2VL - || model.proj_type == PROJECTOR_TYPE_QWEN25VL - || model.proj_type == PROJECTOR_TYPE_GLM_EDGE - || model.proj_type == PROJECTOR_TYPE_GEMMA3 - || model.proj_type == PROJECTOR_TYPE_IDEFICS3 - || model.proj_type == PROJECTOR_TYPE_MINICPMV - ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM || + model.proj_type == PROJECTOR_TYPE_LDP || model.proj_type == PROJECTOR_TYPE_LDPV2 || + model.proj_type == PROJECTOR_TYPE_QWEN2VL || model.proj_type == PROJECTOR_TYPE_QWEN25VL || + model.proj_type == PROJECTOR_TYPE_GLM_EDGE || model.proj_type == PROJECTOR_TYPE_GEMMA3 || + model.proj_type == PROJECTOR_TYPE_IDEFICS3 || model.proj_type == PROJECTOR_TYPE_MINICPMV) && + layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; if (is_ffn_swapped) { // swap up and down weights ggml_tensor * tmp = layer.ff_up_w; @@ -2766,6 +3060,13 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + // JinaCLIP is a pure vision encoder without separate projection layers + // It only uses patch embedding projections + // No additional mm projection tensors are loaded for JinaCLIP + } + break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -3646,9 +3947,44 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_y = inst.grid_size.height; return true; - } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 - || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL - ) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2) { + clip_image_u8 processed_image; + int sz = params.image_size; // e.g., 512 for JinaCLIP + + // 1) Preserve aspect ratio: resize so that the shorter side == sz (bicubic) + int in_w = img->nx; + int in_h = img->ny; + int out_w, out_h; + if (in_w <= 0 || in_h <= 0) { + LOG_ERR("%s: invalid input image size %dx%d\n", __func__, in_w, in_h); + return false; + } + if (in_w < in_h) { + out_w = sz; + out_h = std::max(1, (int) std::round((double) in_h * sz / in_w)); + } else { + out_h = sz; + out_w = std::max(1, (int) std::round((double) in_w * sz / in_h)); + } + + clip_image_u8 resized_keep_ratio; + image_manipulation::bicubic_resize(*img, resized_keep_ratio, out_w, out_h); + + // 2) Center-crop to sz x sz + int x0 = std::max(0, (resized_keep_ratio.nx - sz) / 2); + int y0 = std::max(0, (resized_keep_ratio.ny - sz) / 2); + int crop_w = std::min(sz, resized_keep_ratio.nx); + int crop_h = std::min(sz, resized_keep_ratio.ny); + + image_manipulation::crop_image(resized_keep_ratio, processed_image, x0, y0, crop_w, crop_h); + + // 3) Normalize + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(processed_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + + } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2 || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL) { GGML_ASSERT(params.proj_scale_factor); // smart resize @@ -3803,6 +4139,11 @@ int 
clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_JINACLIP2:
+            {
+                n_patches = 1;
+            }
+            break;
         case PROJECTOR_TYPE_LDP:
         case PROJECTOR_TYPE_LDPV2:
         case PROJECTOR_TYPE_GLM_EDGE:
@@ -4271,6 +4612,60 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("positions", positions);
             } break;
+        case PROJECTOR_TYPE_JINACLIP2:
+            {
+                // learned pos emb (CLS + patches)
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i; // 0, 1, 2, ... (CLS=0, patches=1+)
+                }
+                set_input_i32("positions", positions);
+
+                // 2D RoPE positions (H/W), CLS kept at 0
+                const int n_patches = model.class_embedding ? (n_pos - 1) : n_pos;
+                const int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(n_pos, 0);
+                // H dimension
+                for (int i = 0; i < n_patches; ++i) {
+                    int idx = model.class_embedding ? (i + 1) : i;
+                    pos_data[idx] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // W dimension
+                std::fill(pos_data.begin(), pos_data.end(), 0);
+                for (int i = 0; i < n_patches; ++i) {
+                    int idx = model.class_embedding ? (i + 1) : i;
+                    pos_data[idx] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+
+                // fill rope_c_first/second (c-tensor default): c_first = 1/s, c_second = 1/(s*odd)
+                int pt_seq_len = 16;
+                if (patch_size > 0) {
+                    int cand = (int) llroundf(224.0f / (float) patch_size);
+                    if (cand > 0) pt_seq_len = cand;
+                }
+                float s = (float) pt_seq_len / (float) n_patches_per_col;
+                int d_head_local = hparams.n_embd / hparams.n_head;
+                int half_local = d_head_local/2;
+                std::vector<float> rope_c_first(half_local);
+                std::vector<float> rope_c_second(half_local);
+                float odd = std::pow(hparams.rope_theta, (float)-2.0f / (float)d_head_local);
+                for (int k = 0; k < half_local; ++k) {
+                    rope_c_first[k] = 1.0f / s;
+                    rope_c_second[k] = 1.0f / (s * odd);
+                }
+                // set inputs
+                auto t1 = ggml_graph_get_tensor(gf, "rope_c_first");
+                auto t2 = ggml_graph_get_tensor(gf, "rope_c_second");
+                GGML_ASSERT(t1 && (t1->flags & GGML_TENSOR_FLAG_INPUT));
+                GGML_ASSERT(t2 && (t2->flags & GGML_TENSOR_FLAG_INPUT));
+                ggml_backend_tensor_set(t1, rope_c_first.data(), 0, ggml_nbytes(t1));
+                ggml_backend_tensor_set(t2, rope_c_second.data(), 0, ggml_nbytes(t2));
+
+                // unified ggml_rope_ext path; no precomputed cos/sin tables
+            }
+            break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_LDP:
@@ -4340,17 +4735,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }

-    // print debug nodes
-    if (ctx->debug_graph) {
-        LOG_INF("\n\n---\n\n");
-        LOG_INF("\n\nDebug graph:\n\n");
-        for (ggml_tensor * t : ctx->debug_print_tensors) {
-            std::vector<uint8_t> data(ggml_nbytes(t));
-            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
-            print_tensor_shape(t);
-            print_tensor_data(t, data.data(), 3);
-        }
-    }
+    // debug printing and env-based dumps removed

     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -4378,6 +4763,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
            return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_JINACLIP2:
+            return ctx->model.hparams.projection_dim;
        case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
@@ -4432,6 +4819,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }

+bool clip_is_jinaclip2(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_VISION;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3387cdbd36955..65f9b4185fc46 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -95,6 +95,7 @@ bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
+bool clip_is_jinaclip2(const struct clip_ctx * ctx);

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32ae..556d617abe1e8 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -39,7 +39,13 @@ static void show_additional_info(int /*argc*/, char ** argv) {
     LOG(
         "Experimental CLI for multimodal\n\n"
         "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio