From 2787888e4d6fde368affa8fe5647dbe2427f9be3 Mon Sep 17 00:00:00 2001 From: liyang Date: Wed, 22 Oct 2025 16:38:24 +0800 Subject: [PATCH] mtmd/clip+converter: address #16574; fold CLI into mtmd-cli, fix converter keys, rope_ext + bicubic mtmd-cli: move the standalone Jina CLI into mtmd-cli (projector-only path); drop the extra binary. --- convert_hf_to_gguf.py | 325 +++++++++++++++++++++- gguf-py/gguf/constants.py | 1 + tools/mtmd/clip-impl.h | 47 ++-- tools/mtmd/clip.cpp | 541 ++++++++++++++++++++++++++++++++----- tools/mtmd/clip.h | 1 + tools/mtmd/mtmd-cli.cpp | 147 +++++++++- tools/mtmd/mtmd-helper.cpp | 14 + tools/mtmd/mtmd-helper.h | 5 + tools/mtmd/mtmd.cpp | 95 +++++++ tools/mtmd/mtmd.h | 30 ++ 10 files changed, 1097 insertions(+), 109 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ed99dc8477231..0ebe5eea99e11 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5026,7 +5026,18 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, if lora_names := hparams.get("lora_adaptations"): self._lora_names = lora_names - self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + try: + text_cfg = hparams.get("text_config", {}) if isinstance(hparams.get("text_config", {}), dict) else {} + pe_type = (text_cfg.get("position_embedding_type") or hparams.get("position_embedding_type") or "").lower() + rope_base = text_cfg.get("rotary_emb_base", hparams.get("rotary_emb_base")) + name_path = (hparams.get("_name_or_path") or "").lower() + is_vx = ("jina" in name_path and ("v2" in name_path or "v3" in name_path)) + is_v3 = (pe_type == "rotary" or rope_base is not None) and is_vx + if (is_v3) or self._lora_names: + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + except Exception: + pass super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) self._xlmroberta_tokenizer_init() @@ -6248,6 +6259,293 @@ def set_vocab(self): raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel') +@ModelBase.register("JinaCLIPVisionModel", "JinaCLIPModel") +class JinaCLIPVisionModel(MmprojModel): + """JinaCLIP v2 Vision Encoder Model - handles vision component only""" + model_arch = gguf.MODEL_ARCH.MMPROJ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Load config for vision encoder + config_path = self.dir_model / "config.json" + if config_path.exists(): + with open(config_path, encoding="utf-8") as f: + self.vision_config = json.load(f) + else: + # Default JinaCLIP v2 vision configuration + self.vision_config = { + "image_size": 448, + "patch_size": 14, + "hidden_size": 1024, + "num_hidden_layers": 24, + "num_attention_heads": 16, + "intermediate_size": 2731, + "layer_norm_eps": 1e-5, + "projection_dim": 1024 + } + + def set_vocab(self): + # Vision encoder doesn't need vocabulary + pass + + def set_gguf_parameters(self): + # Identification (arch/name is set by writer); mark vision encoder presence + self.gguf_writer.add_clip_has_vision_encoder(True) + + # Vision parameters + config = self.vision_config + img_sz = int(config.get("image_size", 448)) + patch_sz = int(config.get("patch_size", 14)) + n_embd = int(config.get("hidden_size", 1024)) + n_layer = int(config.get("num_hidden_layers", 24)) + n_head = int(config.get("num_attention_heads", 16)) + n_ff = int(config.get("intermediate_size", 2731)) + proj_dim = int(config.get("projection_dim", 1024)) + + # Use gguf writer helpers (constants + typed setters) + self.gguf_writer.add_vision_image_size(img_sz) + 
self.gguf_writer.add_vision_patch_size(patch_sz) + self.gguf_writer.add_vision_embedding_length(n_embd) + self.gguf_writer.add_vision_block_count(n_layer) + self.gguf_writer.add_vision_projection_dim(proj_dim) + self.gguf_writer.add_vision_feed_forward_length(n_ff) + self.gguf_writer.add_vision_head_count(n_head) + # LayerNorm epsilon comes from config (fallback 1e-5) + eps_attn = float(config.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_attention_layernorm_eps(eps_attn) + + # Preprocessing defaults + self.gguf_writer.add_vision_image_mean([0.48145466, 0.4578275, 0.40821073]) + self.gguf_writer.add_vision_image_std ([0.26862954, 0.26130258, 0.27577711]) + + # Projector type and activation + # JinaCLIP v2 projector type string follows upstream style (family+major) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JINACLIP2) + self.gguf_writer.add_vision_use_silu(True) + + # RoPE parameter used by vision encoder (prefer config override) + try: + rt = config.get("rope_theta", None) + rope_theta = float(rt) if rt is not None else 10000.0 + except Exception: + rope_theta = 10000.0 + # writer currently has no dedicated setter for this key; keep direct write + self.gguf_writer.add_float32("clip.vision.rope_theta", rope_theta) + + # Compatibility (mmproj) — not covered by gguf writer helpers yet + self.gguf_writer.add_uint32("mmproj.embedding_length", n_embd) + self.gguf_writer.add_uint32("mmproj.block_count", n_layer) + + logger.info( + "mmproj(jinaclip): image_size=%d patch_size=%d n_embd=%d n_layer=%d n_head=%d n_ff=%d proj_dim=%d", + img_sz, patch_sz, n_embd, n_layer, n_head, n_ff, proj_dim + ) + + # helpers to keep modify_tensors compact and consistent with other models + def _strip_vm_prefix(self, name: str) -> str: + return name[len('vision_model.'):] if name.startswith('vision_model.') else name + + def _map_block_tensor(self, layer: int, rest: str, data_torch: Tensor, name: str) -> list[tuple[str, Tensor]] | None: + parts = rest.split('.') + # layer norms + if rest.startswith('norm1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ln_1.{suffix}', data_torch)] + if rest.startswith('norm2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ln_2.{suffix}', data_torch)] + if rest.startswith('attn.inner_attn_ln.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_ln.{suffix}', data_torch)] + + # fused qkv + if rest == 'attn.qkv.weight': + w = data_torch + wdim = w.shape[0] + if wdim % 3 != 0: + logger.warning('mmproj(jinaclip): unexpected qkv weight shape %s for %s', tuple(w.shape), name) + d = wdim // 3 + q, k, v = w[0:d, :], w[d:2 * d, :], w[2 * d:, :] + return [ + (f'v.blk.{layer}.attn_q.weight', q), + (f'v.blk.{layer}.attn_k.weight', k), + (f'v.blk.{layer}.attn_v.weight', v), + ] + if rest == 'attn.qkv.bias': + b = data_torch + bdim = b.shape[0] + if bdim % 3 != 0: + logger.warning('mmproj(jinaclip): unexpected qkv bias shape %s for %s', tuple(b.shape), name) + d = bdim // 3 + qb, kb, vb = b[0:d], b[d:2 * d], b[2 * d:] + return [ + (f'v.blk.{layer}.attn_q.bias', qb), + (f'v.blk.{layer}.attn_k.bias', kb), + (f'v.blk.{layer}.attn_v.bias', vb), + ] + # separate q/v bias (some checkpoints) + if rest == 'attn.q_bias': + return [(f'v.blk.{layer}.attn_q.bias', data_torch)] + if rest == 'attn.v_bias': + return [(f'v.blk.{layer}.attn_v.bias', data_torch)] + + # separate projections + if rest.startswith('attn.q_proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_q.{suffix}', data_torch)] + if rest.startswith('attn.k_proj.'): + suffix = parts[-1] + 
return [(f'v.blk.{layer}.attn_k.{suffix}', data_torch)] + if rest.startswith('attn.v_proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_v.{suffix}', data_torch)] + if rest.startswith('attn.proj.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.attn_out.{suffix}', data_torch)] + + # MLP + if rest.startswith('mlp.w1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_gate.{suffix}', data_torch)] + if rest.startswith('mlp.w2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)] + if rest.startswith('mlp.w3.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)] + if rest.startswith('mlp.ffn_ln.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_norm.{suffix}', data_torch)] + if rest.startswith('mlp.fc1.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_up.{suffix}', data_torch)] + if rest.startswith('mlp.fc2.'): + suffix = parts[-1] + return [(f'v.blk.{layer}.ffn_down.{suffix}', data_torch)] + return None + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + """Prefer base table-driven mapping; keep Jina-specific targets if already mapped; fallback to legacy mapper.""" + # Already a GGUF target name (e.g., "v.*" or "mm.*"): return as-is + if name.startswith('v.') or name.startswith('mm.'): + return name + # Try the base mapping first + try: + return super().map_tensor_name(name, try_suffixes=try_suffixes) + except Exception: + # Fallback to legacy Jina-specific mapper for any remaining edge keys + if hasattr(self, "_map_jinaclip_tensor_name"): + mapped = self._map_jinaclip_tensor_name(name) # type: ignore[attr-defined] + if mapped: + return mapped + return name + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + """Yield tensors for the vision encoder. + + Prefer the base implementation (supports sharded/indexed weights). If that fails + or no parts are detected, fall back to a direct single-file load. + """ + # Try base path (indexed/sharded) + try: + if getattr(self, "part_names", None): + for name, tensor in super().get_tensors(): + yield name, tensor + return + except Exception as e: + logger.warning("mmproj(jinaclip): base get_tensors failed, falling back to direct load: %s", e) + + # Fallback: direct single-file load + import torch + candidates = [ + self.dir_model / "pytorch_model.bin", + self.dir_model / "model.safetensors", + self.dir_model / "pytorch_model.safetensors", + self.dir_model / "vision_model_weights.bin", + ] + model_path = next((p for p in candidates if p.exists()), None) + if model_path is None: + raise FileNotFoundError(f"mmproj(jinaclip): no model weights found in {self.dir_model}") + + logger.info("mmproj(jinaclip): loading weights from %s", model_path) + if model_path.suffix == ".bin": + state_dict = torch.load(model_path, map_location="cpu", weights_only=True) + else: + from safetensors.torch import load_file + state_dict = load_file(str(model_path)) + + count = 0 + for name, tensor in state_dict.items(): + # yield raw names; modify_tensors will normalize & map + yield name, tensor + count += 1 + + logger.info("mmproj(jinaclip): yielded %d raw tensors", count) + + def _should_be_f32(self, gguf_name: str) -> bool: + """Return True if tensor should be stored as F32 to avoid type mismatches in C++ runtime. + + Keep the list minimal: LayerNorm weights/bias are the common source of + binary-op dtype issues; patch embedding bias is also safer as F32. 
+ """ + patterns = ( + ".ln_1.weight", ".ln_1.bias", + ".ln_2.weight", ".ln_2.bias", + ".attn_ln.weight", ".attn_ln.bias", + ".ffn_norm.weight", ".ffn_norm.bias", + "v.patch_embd.proj.bias", + ) + return any(p in gguf_name for p in patterns) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + """Normalize JinaCLIP vision tensor names to base-friendly patterns, with Jina-specific exceptions. + + - Emit Jina-specific targets directly for: patch/proj, pos_embed, inner-attn LN, SwiGLU FFN names. + - If fused QKV is encountered, split into Q/K/V. + - For standard pieces (norm1/norm2, q/k/v/out), map to v.blk.{i}.* targets. + """ + del bid # unused + + src = name + if src.startswith('v.') or src.startswith('mm.'): + return [(src, data_torch)] + + # Drop 'vision_model.' prefix if present + src_no_vm = self._strip_vm_prefix(src) + + # Top-level direct mappings — use gguf constants directly for canonical names + if src_no_vm == 'cls_token': + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_CLS] + return [(base, data_torch)] + if src_no_vm.startswith('patch_embed.proj.'): + suffix = src_no_vm.split('.')[-1] + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + return [(f'{base}.{suffix}', data_torch)] + if src_no_vm == 'pos_embed': + pos_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_POS] + '.weight' + return [(pos_name, data_torch)] + if src_no_vm.startswith('norm.'): + suffix = src_no_vm.split('.')[-1] + base = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_POST_NORM] + return [(f'{base}.{suffix}', data_torch)] + + if src_no_vm.startswith('blocks.'): + parts = src_no_vm.split('.') + if len(parts) >= 3 and parts[1].isdigit(): + layer = int(parts[1]) + rest = '.'.join(parts[2:]) + mapped = self._map_block_tensor(layer, rest, data_torch, name) + if mapped is not None: + return mapped + + try: + return [(self.map_tensor_name(name), data_torch)] + except Exception: + logger.debug("mmproj(jinaclip): skip unmapped tensor %s", name) + return [] + + @ModelBase.register("OpenELMForCausalLM") class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @@ -9614,16 +9912,21 @@ def main() -> None: else: model_class = MistralModel - model_instance = model_class(dir_model, output_type, fname_out, - is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules - ) + common_kwargs = dict( + is_big_endian=args.bigendian, + use_temp_file=args.use_temp_file, + eager=args.no_lazy, + metadata_override=args.metadata, + model_name=args.model_name, + split_max_tensors=args.split_max_tensors, + split_max_size=split_str_to_n_bytes(args.split_max_size), + dry_run=args.dry_run, + small_first_shard=args.no_tensor_first_split, + remote_hf_model_id=hf_repo_id, + disable_mistral_community_chat_template=disable_mistral_community_chat_template, + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + ) + model_instance = model_class(dir_model, output_type, fname_out, **common_kwargs) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/gguf-py/gguf/constants.py 
b/gguf-py/gguf/constants.py index 1b71fb3749aaa..6a284a1cd8441 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -3057,6 +3057,7 @@ class VisionProjectorType: QWEN25VL = "qwen2.5vl_merger" ULTRAVOX = "ultravox" INTERNVL = "internvl" + JINACLIP2 = "jinaclip2" QWEN2A = "qwen2a" # audio QWEN25O = "qwen2.5o" # omni VOXTRAL = "voxtral" diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 1669fad99b36b..86b19c53c5203 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -39,6 +39,8 @@ #define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_SPATIAL_MERGE_SIZE "clip.vision.spatial_merge_size" +// RoPE base (theta) for vision encoder +#define KEY_VISION_ROPE_THETA "clip.vision.rope_theta" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -67,14 +69,15 @@ #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" +#define TN_ATTN_LN "%s.blk.%d.attn_ln.%s" // inner attention LayerNorm #define TN_ATTN_K_NORM "%s.blk.%d.attn_k_norm.%s" #define TN_ATTN_Q_NORM "%s.blk.%d.attn_q_norm.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" -#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" -#define TN_LN_1 "%s.blk.%d.ln1.%s" // layer norm -#define TN_LN_2 "%s.blk.%d.ln2.%s" // layer norm +#define TN_FFN_NORM "%s.blk.%d.ffn_norm.%s" +#define TN_LN_1 "%s.blk.%d.ln_1.%s" // layer norm +#define TN_LN_2 "%s.blk.%d.ln_2.%s" // layer norm #define TN_LS_1 "%s.blk.%d.ls1.%s" // layer scale #define TN_LS_2 "%s.blk.%d.ls2.%s" // layer scale #define TN_LN_PRE "%s.pre_ln.%s" @@ -137,30 +140,32 @@ enum projector_type { PROJECTOR_TYPE_QWEN2A, PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx PROJECTOR_TYPE_VOXTRAL, + PROJECTOR_TYPE_JINACLIP2, // JinaCLIP v2 PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_MINICPMV, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, - { PROJECTOR_TYPE_INTERNVL, "internvl"}, - { PROJECTOR_TYPE_LLAMA4, "llama4"}, - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, - { PROJECTOR_TYPE_LFM2, "lfm2"}, - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2" }, + { PROJECTOR_TYPE_MINICPMV, "resampler" }, + { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, + { PROJECTOR_TYPE_GEMMA3, "gemma3" }, + { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, + { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, + { PROJECTOR_TYPE_INTERNVL, "internvl" }, + { PROJECTOR_TYPE_LLAMA4, "llama4" }, + { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, + { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, + { 
PROJECTOR_TYPE_JINACLIP2, "jinaclip2" }, + { PROJECTOR_TYPE_LFM2, "lfm2" }, + { PROJECTOR_TYPE_KIMIVL, "kimivl" }, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index f2abf88523843..0c736b79e7182 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -218,6 +218,10 @@ struct clip_layer { ggml_tensor * o_w = nullptr; ggml_tensor * o_b = nullptr; + // inner attention LayerNorm (JinaCLIP specific) + ggml_tensor * attn_ln_w = nullptr; + ggml_tensor * attn_ln_b = nullptr; + ggml_tensor * k_norm = nullptr; ggml_tensor * q_norm = nullptr; @@ -232,6 +236,10 @@ struct clip_layer { ggml_tensor * ff_down_w = nullptr; ggml_tensor * ff_down_b = nullptr; + // FFN sub-layer norm (GGUF: v.blk.{i}.ffn_norm.{weight,bias}) + ggml_tensor * ffn_norm_w = nullptr; + ggml_tensor * ffn_norm_b = nullptr; + // layernorm 2 ggml_tensor * ln_2_w = nullptr; ggml_tensor * ln_2_b = nullptr; @@ -424,9 +432,8 @@ struct clip_ctx { backend_ptrs.push_back(backend_cpu); backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu)); - sched.reset( - ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true) - ); + sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, + false, true)); } ~clip_ctx() { @@ -762,8 +769,8 @@ struct clip_graph { ggml_tensor * attn_mask = full_attn ? nullptr : window_mask; - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, attn_mask, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, attn_mask, + kq_scale, il); cb(cur, "attn_out", il); } @@ -779,11 +786,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, hparams.ffn_op, il); cb(cur, "ffn_out", il); @@ -895,10 +899,9 @@ struct clip_graph { cb(K, "resampler_K", -1); cb(V, "resampler_V", -1); - embeddings = build_attn( - model.mm_model_attn_o_w, - model.mm_model_attn_o_b, - Q, K, V, nullptr, kq_scale, -1); + embeddings = build_attn(model.mm_model_attn_o_w, model.mm_model_attn_o_b, nullptr, + nullptr, // no inner attn LayerNorm for resampler + Q, K, V, nullptr, kq_scale, -1); cb(embeddings, "resampler_attn_out", -1); } // layernorm @@ -1229,8 +1232,7 @@ struct clip_graph { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -1246,11 +1248,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - hparams.ffn_op, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, hparams.ffn_op, il); cb(cur, "ffn_out", il); @@ -1509,6 +1508,177 @@ struct clip_graph { return gf; } + ggml_cgraph * build_jina() { + const int n_pos = n_patches + (model.class_embedding ? 
1 : 0);
+
+        GGML_ASSERT(n_patches_x == n_patches_y && "only square images supported");
+
+        // JinaCLIP uses the full encoder depth; no earlier feature layer is selected
+        int max_feature_layer = n_layer;
+
+        ggml_tensor * inp = build_inp();
+
+        if (ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2) {
+            ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+            ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+            ggml_set_name(pos_h, "pos_h");
+            ggml_set_name(pos_w, "pos_w");
+            ggml_set_input(pos_h);
+            ggml_set_input(pos_w);
+            ggml_build_forward_expand(gf, pos_h);
+            ggml_build_forward_expand(gf, pos_w);
+
+            // per-dim frequency scaling vectors for RoPE halves (default c-tensor path)
+            ggml_tensor * rope_c_first  = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head/2);
+            ggml_tensor * rope_c_second = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, d_head/2);
+            ggml_set_name(rope_c_first, "rope_c_first");
+            ggml_set_name(rope_c_second, "rope_c_second");
+            ggml_set_input(rope_c_first);
+            ggml_set_input(rope_c_second);
+            ggml_build_forward_expand(gf, rope_c_first);
+            ggml_build_forward_expand(gf, rope_c_second);
+
+        }
+        // concat class_embeddings and patch_embeddings
+        if (model.class_embedding) {
+            inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+        }
+
+        // Position encoding: JinaCLIP uses BOTH learned position embeddings AND 2D RoPE (applied inside attention)
+        ggml_tensor * positions = nullptr;
+
+        // 1) add learned position embeddings (x = x + pos_embed)
+        positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(positions, "positions");
+        ggml_set_input(positions);
+
+        inp = ggml_add(ctx0, inp, ggml_get_rows(ctx0, model.position_embeddings, positions));
+
+        ggml_tensor * inpL = inp;
+
+        // pre-layernorm
+        if (model.pre_ln_w) {
+            inpL = build_norm(inpL, model.pre_ln_w, model.pre_ln_b, NORM_TYPE_NORMAL, eps, -1);
+        }
+        std::vector<ggml_tensor *> embedding_stack;
+        const auto & vision_feature_layer = hparams.vision_feature_layer;
+
+        // loop over layers
+        for (int il = 0; il < max_feature_layer; il++) {
+            auto & layer = model.layers[il];
+            ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states
+
+            // If this is an embedding feature layer, save the output.
+            // NOTE: 0 index here refers to the input to the encoder.
+            if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+                embedding_stack.push_back(cur);
+            }
+
+            // layernorm
+            cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il);
+
+            // self-attention
+            {
+                ggml_tensor * Qcur = ggml_mul_mat(ctx0, layer.q_w, cur);
+                if (layer.q_b) {
+                    Qcur = ggml_add(ctx0, Qcur, layer.q_b);
+                }
+
+                ggml_tensor * Kcur = ggml_mul_mat(ctx0, layer.k_w, cur);
+                if (layer.k_b) {
+                    Kcur = ggml_add(ctx0, Kcur, layer.k_b);
+                }
+
+                ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.v_w, cur);
+                if (layer.v_b) {
+                    Vcur = ggml_add(ctx0, Vcur, layer.v_b);
+                }
+                Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, n_pos, 1);
+                Kcur = ggml_reshape_4d(ctx0, Kcur, d_head, n_head, n_pos, 1);
+                Vcur = ggml_reshape_4d(ctx0, Vcur, d_head, n_head, n_pos, 1);
+
+                ggml_tensor * Q_rope_in = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+                ggml_tensor * K_rope_in = ggml_permute(ctx0, Kcur, 0, 2, 1, 3);
+
+                // Apply 2D RoPE position encoding for JinaCLIP (only to patch tokens, skip CLS token)
+                // JinaCLIP has CLS token at position 0, only apply RoPE to patch tokens (position 1+)
+                // Following Python implementation: q[:, :, 1:, :] gets RoPE applied
+
+                // Split Q: CLS token (pos 0) + patch tokens (pos 1+)
+                ggml_tensor * Q_cls =
+                    ggml_view_3d(ctx0, Q_rope_in, d_head, 1, n_head, Q_rope_in->nb[1], Q_rope_in->nb[2], 0);
+                ggml_tensor * Q_patches = ggml_view_3d(ctx0, Q_rope_in, d_head, n_pos - 1, n_head, Q_rope_in->nb[1],
+                                                       Q_rope_in->nb[2], Q_rope_in->nb[1]);
+
+                // Split K: CLS token (pos 0) + patch tokens (pos 1+)
+                ggml_tensor * K_cls =
+                    ggml_view_3d(ctx0, K_rope_in, d_head, 1, n_head, K_rope_in->nb[1], K_rope_in->nb[2], 0);
+                ggml_tensor * K_patches = ggml_view_3d(ctx0, K_rope_in, d_head, n_pos - 1, n_head, K_rope_in->nb[1],
+                                                       K_rope_in->nb[2], K_rope_in->nb[1]);
+
+                int pt_seq_len = 16; // fallback pretrain length
+                if (hparams.patch_size > 0) {
+                    int cand = (int) llroundf(224.0f / (float) hparams.patch_size);
+                    if (cand > 0) {
+                        pt_seq_len = cand;
+                    }
+                }
+                const int hw_seq_len = static_cast<int>(sqrtf(n_pos - 1)); // image grid size (excluding CLS)
+                Q_patches = build_jinaclip_rope(ctx0, ctx, Q_patches, pt_seq_len, hw_seq_len, hparams.rope_theta, true,
+                                                true, il);
+                K_patches = build_jinaclip_rope(ctx0, ctx, K_patches, pt_seq_len, hw_seq_len, hparams.rope_theta, true,
+                                                false, il);
+                GGML_ASSERT(Q_cls->ne[0] == Q_patches->ne[0]);
+                GGML_ASSERT(Q_cls->ne[2] == Q_patches->ne[2]);
+                GGML_ASSERT(Q_cls->ne[3] == Q_patches->ne[3]);
+
+                // Recombine: CLS token + RoPE-processed patch tokens (seq is dim 1 in this layout)
+                ggml_tensor * Q_rope_out = ggml_concat(ctx0, Q_cls, Q_patches, 1);
+                ggml_tensor * K_rope_out = ggml_concat(ctx0, K_cls, K_patches, 1);
+
+                // Restore to pre-permute layout [dim, head, seq, 1]
+                Qcur = ggml_permute(ctx0, Q_rope_out, 0, 2, 1, 3);
+                Kcur = ggml_permute(ctx0, K_rope_out, 0, 2, 1, 3);
+
+                cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, nullptr,
+                                 kq_scale, il);
+            }
+
+            // re-add the layer input, e.g., residual
+            cur = ggml_add(ctx0, cur, inpL);
+            cb(inpL, "inp_after_attn", il);
+
+            inpL = cur; // inpL = residual, cur = hidden_states
+
+            cb(cur, "ffn_inp", il);
+
+            // layernorm2
+            cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il);
+            cb(cur, "ffn_inp_normed", il);
+            // ffn
+            cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w,
+                            layer.ff_down_b, layer.ffn_norm_w, layer.ffn_norm_b, hparams.ffn_op, il);
+
+            // residual 2
+            cur =
ggml_add(ctx0, inpL, cur); + + inpL = cur; + } + + // post-layernorm + if (model.post_ln_w) { + inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, NORM_TYPE_NORMAL, eps, -1); + } + // final + + ggml_tensor * emb2d = ggml_reshape_2d(ctx0, inpL, inpL->ne[0], inpL->ne[1]); + ggml_tensor * cls = ggml_view_2d(ctx0, emb2d, emb2d->ne[0], /*rows=*/1, emb2d->nb[1], /*offset=*/0); + ggml_set_name(cls, "cls_view"); + ggml_build_forward_expand(gf, cls); + return gf; + } + // whisper encoder with custom projector ggml_cgraph * build_whisper_enc() { const int n_frames = img.nx; @@ -1727,8 +1897,8 @@ struct clip_graph { cb(Kcur, "Kcur_pos", il); } - cur = build_attn(layer.o_w, layer.o_b, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cur = build_attn(layer.o_w, layer.o_b, layer.attn_ln_w, layer.attn_ln_b, Qcur, Kcur, Vcur, nullptr, + kq_scale, il); cb(cur, "attn_out", il); } @@ -1749,11 +1919,8 @@ struct clip_graph { cb(cur, "ffn_inp_normed", il); // ffn - cur = build_ffn(cur, - layer.ff_up_w, layer.ff_up_b, - layer.ff_gate_w, layer.ff_gate_b, - layer.ff_down_w, layer.ff_down_b, - ffn_t, il); + cur = build_ffn(cur, layer.ff_up_w, layer.ff_up_b, layer.ff_gate_w, layer.ff_gate_b, layer.ff_down_w, + layer.ff_down_b, nullptr, nullptr, ffn_t, il); cb(cur, "ffn_out", il); @@ -1837,17 +2004,17 @@ struct clip_graph { return cur; } - ggml_tensor * build_ffn( - ggml_tensor * cur, - ggml_tensor * up, - ggml_tensor * up_b, - ggml_tensor * gate, - ggml_tensor * gate_b, - ggml_tensor * down, - ggml_tensor * down_b, - ffn_op_type type_op, - int il) const { - + ggml_tensor * build_ffn(ggml_tensor * cur, + ggml_tensor * up, + ggml_tensor * up_b, + ggml_tensor * gate, + ggml_tensor * gate_b, + ggml_tensor * down, + ggml_tensor * down_b, + ggml_tensor * ffn_norm_w, + ggml_tensor * ffn_norm_b, + ffn_op_type type_op, + int il) const { ggml_tensor * tmp = up ? 
ggml_mul_mat(ctx0, up, cur) : cur; cb(tmp, "ffn_up", il); @@ -1904,6 +2071,12 @@ struct clip_graph { } break; } + // Apply FFN sub-layer LayerNorm if provided: x = ffn_ln(x) + if (ffn_norm_w || ffn_norm_b) { + cur = build_norm(cur, ffn_norm_w, ffn_norm_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_norm", il); + } + if (down) { cur = ggml_mul_mat(ctx0, down, cur); } @@ -1914,20 +2087,22 @@ struct clip_graph { if (down_b) { cur = ggml_add(ctx0, cur, down_b); + cb(cur, "ffn_down_b", il); } return cur; } - ggml_tensor * build_attn( - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_mask, - float kq_scale, - int il) const { + ggml_tensor * build_attn(ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * attn_ln_w, + ggml_tensor * attn_ln_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_mask, + float kq_scale, + int il) const { // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced ggml_build_forward_expand(gf, q_cur); @@ -1965,6 +2140,21 @@ struct clip_graph { cb(cur, "kqv_out", il); + // inner attention LayerNorm (JinaCLIP specific) + if (attn_ln_w && attn_ln_b) { + ggml_tensor * attn_ln_w_f32 = attn_ln_w; + ggml_tensor * attn_ln_b_f32 = attn_ln_b; + + if (attn_ln_w->type == GGML_TYPE_F16) { + attn_ln_w_f32 = ggml_cast(ctx0, attn_ln_w, GGML_TYPE_F32); + } + if (attn_ln_b->type == GGML_TYPE_F16) { + attn_ln_b_f32 = ggml_cast(ctx0, attn_ln_b, GGML_TYPE_F32); + } + // use model epsilon from metadata for consistency + cur = build_norm(cur, attn_ln_w_f32, attn_ln_b_f32, NORM_TYPE_NORMAL, hparams.eps, il); + } + if (wo) { cur = ggml_mul_mat(ctx0, wo, cur); } @@ -1976,6 +2166,92 @@ struct clip_graph { return cur; } + + // JinaCLIP RoPE: implement VisionRotaryEmbeddingFast via ggml_rope_ext with constant frequency scaling s = pt/ft + ggml_tensor * build_jinaclip_rope(ggml_context * ctx0, + clip_ctx * /*ctx*/, + ggml_tensor * cur, + const int pt_seq_len, + const int ft_seq_len, + const float freq_base, + const bool has_cls_token = true, + const bool /*if_query*/ = true, + const int /*layer_id*/ = -1) { + (void) pt_seq_len; + (void) ft_seq_len; + const int64_t n_dim = cur->ne[0]; + const int64_t n_pos_patches = cur->ne[1]; // seq length (patches-only) + const int64_t n_head = cur->ne[2]; + + GGML_ASSERT(n_dim % 2 == 0); + const int64_t half = n_dim/2; + + // constant frequency scaling (VisionRotaryEmbeddingFast): s = pt/ft + // use c-tensor path by default; freq_scale fixed to 1.0 + + ggml_tensor * pos_h_full = ggml_graph_get_tensor(gf, "pos_h"); + ggml_tensor * pos_w_full = ggml_graph_get_tensor(gf, "pos_w"); + GGML_ASSERT(pos_h_full && pos_w_full); + + const int64_t offset = has_cls_token ? 
1 : 0; + ggml_tensor * pos_h = ggml_view_1d(ctx0, pos_h_full, n_pos_patches, offset * (int64_t)ggml_element_size(pos_h_full)); + ggml_tensor * pos_w = ggml_view_1d(ctx0, pos_w_full, n_pos_patches, offset * (int64_t)ggml_element_size(pos_w_full)); + // fixed H/W selection; remove debug env toggles + ggml_tensor * pos_a = pos_h; + ggml_tensor * pos_b = pos_w; + + + ggml_tensor * first = ggml_view_3d(ctx0, cur, + half, n_head, n_pos_patches, + /*nb1 for head*/ cur->nb[2], + /*nb2 for seq */ cur->nb[1], + 0); + ggml_tensor * c_first = ggml_graph_get_tensor(gf, "rope_c_first"); + ggml_tensor * c_second = ggml_graph_get_tensor(gf, "rope_c_second"); + GGML_ASSERT(c_first && c_second); + + ggml_tensor * first_rot = ggml_rope_ext( + ctx0, + first, + pos_a, + c_first, + half, + 0, + 0, + freq_base, + 1.0f, + 0.0f, 1.0f, 0.0f, 0.0f); + first = ggml_view_3d(ctx0, first_rot, + half, n_pos_patches, n_head, + /*nb1 for seq */ first_rot->nb[2], + /*nb2 for head*/ first_rot->nb[1], + 0); + + ggml_tensor * second_hs = ggml_view_3d(ctx0, cur, + half, n_head, n_pos_patches, + /*nb1 for head*/ cur->nb[2], + /*nb2 for seq */ cur->nb[1], + /*offset*/ half * ggml_element_size(cur)); + ggml_tensor * second_rot = ggml_rope_ext( + ctx0, + second_hs, + pos_b, + c_second, + half, + 0, + 0, + freq_base, + 1.0f, + 0.0f, 1.0f, 0.0f, 0.0f); + ggml_tensor * second = ggml_view_3d(ctx0, second_rot, + half, n_pos_patches, n_head, + /*nb1 for seq */ second_rot->nb[2], + /*nb2 for head*/ second_rot->nb[1], + 0); + ggml_tensor * result = ggml_concat(ctx0, first, second, 0); + return result; + } + // implementation of the 2D RoPE without adding a new op in ggml // this is not efficient (use double the memory), but works on all backends // TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 @@ -2121,6 +2397,11 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_whisper_enc(); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + res = graph.build_jina(); + } + break; case PROJECTOR_TYPE_KIMIVL: { res = graph.build_kimivl(); @@ -2218,7 +2499,7 @@ struct clip_model_loader { model.modality = modality; - // projector type + // projector type & optional version std::string proj_type; { // default key @@ -2247,6 +2528,8 @@ struct clip_model_loader { ? 
PROJECTOR_TYPE_QWEN25VL : PROJECTOR_TYPE_QWEN2A; } + + // JinaCLIP versions are distinguished by projector_type (e.g., jinaclip2) } const bool is_vision = model.modality == CLIP_MODALITY_VISION; @@ -2427,6 +2710,13 @@ struct clip_model_loader { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor); set_llava_uhd_res_candidates(model, 3); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + // default RoPE base for vision; prefer GGUF override if present + hparams.rope_theta = 10000.0f; + get_f32(KEY_VISION_ROPE_THETA, hparams.rope_theta, /*required=*/false); + } + break; case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_QWEN2A: case PROJECTOR_TYPE_VOXTRAL: @@ -2545,6 +2835,12 @@ struct clip_model_loader { layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + // JinaCLIP-specific inner attention LayerNorm is optional for other projectors + { + const bool need_attn_ln = model.proj_type == PROJECTOR_TYPE_JINACLIP2; + layer.attn_ln_w = get_tensor(string_format(TN_ATTN_LN, prefix, il, "weight"), need_attn_ln); + layer.attn_ln_b = get_tensor(string_format(TN_ATTN_LN, prefix, il, "bias"), need_attn_ln); + } layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); @@ -2555,22 +2851,20 @@ struct clip_model_loader { layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false); layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight")); layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false); + layer.ffn_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight"), false); + layer.ffn_norm_b = get_tensor(string_format(TN_FFN_NORM, prefix, il, "bias"), false); // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check! 
- bool is_ffn_swapped = ( + bool is_ffn_swapped = + ( // only old models need this fix - model.proj_type == PROJECTOR_TYPE_MLP - || model.proj_type == PROJECTOR_TYPE_MLP_NORM - || model.proj_type == PROJECTOR_TYPE_LDP - || model.proj_type == PROJECTOR_TYPE_LDPV2 - || model.proj_type == PROJECTOR_TYPE_QWEN2VL - || model.proj_type == PROJECTOR_TYPE_QWEN25VL - || model.proj_type == PROJECTOR_TYPE_GLM_EDGE - || model.proj_type == PROJECTOR_TYPE_GEMMA3 - || model.proj_type == PROJECTOR_TYPE_IDEFICS3 - || model.proj_type == PROJECTOR_TYPE_MINICPMV - ) && layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; + model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM || + model.proj_type == PROJECTOR_TYPE_LDP || model.proj_type == PROJECTOR_TYPE_LDPV2 || + model.proj_type == PROJECTOR_TYPE_QWEN2VL || model.proj_type == PROJECTOR_TYPE_QWEN25VL || + model.proj_type == PROJECTOR_TYPE_GLM_EDGE || model.proj_type == PROJECTOR_TYPE_GEMMA3 || + model.proj_type == PROJECTOR_TYPE_IDEFICS3 || model.proj_type == PROJECTOR_TYPE_MINICPMV) && + layer.ff_up_w && layer.ff_down_w && layer.ff_down_w->ne[0] == hparams.n_embd; if (is_ffn_swapped) { // swap up and down weights ggml_tensor * tmp = layer.ff_up_w; @@ -2766,6 +3060,13 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; + case PROJECTOR_TYPE_JINACLIP2: + { + // JinaCLIP is a pure vision encoder without separate projection layers + // It only uses patch embedding projections + // No additional mm projection tensors are loaded for JinaCLIP + } + break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -3646,9 +3947,44 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->grid_y = inst.grid_size.height; return true; - } else if ( ctx->proj_type() == PROJECTOR_TYPE_LFM2 - || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL - ) { + } else if (ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2) { + clip_image_u8 processed_image; + int sz = params.image_size; // e.g., 512 for JinaCLIP + + // 1) Preserve aspect ratio: resize so that the shorter side == sz (bicubic) + int in_w = img->nx; + int in_h = img->ny; + int out_w, out_h; + if (in_w <= 0 || in_h <= 0) { + LOG_ERR("%s: invalid input image size %dx%d\n", __func__, in_w, in_h); + return false; + } + if (in_w < in_h) { + out_w = sz; + out_h = std::max(1, (int) std::round((double) in_h * sz / in_w)); + } else { + out_h = sz; + out_w = std::max(1, (int) std::round((double) in_w * sz / in_h)); + } + + clip_image_u8 resized_keep_ratio; + image_manipulation::bicubic_resize(*img, resized_keep_ratio, out_w, out_h); + + // 2) Center-crop to sz x sz + int x0 = std::max(0, (resized_keep_ratio.nx - sz) / 2); + int y0 = std::max(0, (resized_keep_ratio.ny - sz) / 2); + int crop_w = std::min(sz, resized_keep_ratio.nx); + int crop_h = std::min(sz, resized_keep_ratio.ny); + + image_manipulation::crop_image(resized_keep_ratio, processed_image, x0, y0, crop_w, crop_h); + + // 3) Normalize + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(processed_image, *img_f32, params.image_mean, params.image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + + } else if (ctx->proj_type() == PROJECTOR_TYPE_LFM2 || ctx->proj_type() == PROJECTOR_TYPE_KIMIVL) { GGML_ASSERT(params.proj_scale_factor); // smart resize @@ -3803,6 +4139,11 @@ int 
clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_JINACLIP2:
+            {
+                n_patches = 1;
+            }
+            break;
         case PROJECTOR_TYPE_LDP:
         case PROJECTOR_TYPE_LDPV2:
         case PROJECTOR_TYPE_GLM_EDGE:
@@ -4271,6 +4612,60 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 }
                 set_input_i32("positions", positions);
             } break;
+        case PROJECTOR_TYPE_JINACLIP2:
+            {
+                // learned pos emb (CLS + patches)
+                std::vector<int32_t> positions(n_pos);
+                for (int i = 0; i < n_pos; i++) {
+                    positions[i] = i; // 0, 1, 2, ... (CLS=0, patches=1+)
+                }
+                set_input_i32("positions", positions);
+
+                // 2D RoPE positions (H/W), CLS kept at 0
+                const int n_patches = model.class_embedding ? (n_pos - 1) : n_pos;
+                const int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int32_t> pos_data(n_pos, 0);
+                // H dimension
+                for (int i = 0; i < n_patches; ++i) {
+                    int idx = model.class_embedding ? (i + 1) : i;
+                    pos_data[idx] = i / n_patches_per_col;
+                }
+                set_input_i32("pos_h", pos_data);
+                // W dimension
+                std::fill(pos_data.begin(), pos_data.end(), 0);
+                for (int i = 0; i < n_patches; ++i) {
+                    int idx = model.class_embedding ? (i + 1) : i;
+                    pos_data[idx] = i % n_patches_per_col;
+                }
+                set_input_i32("pos_w", pos_data);
+
+                // fill rope_c_first/second (c-tensor default): c_first = 1/s, c_second = 1/(s*odd)
+                int pt_seq_len = 16;
+                if (patch_size > 0) {
+                    int cand = (int) llroundf(224.0f / (float) patch_size);
+                    if (cand > 0) pt_seq_len = cand;
+                }
+                float s = (float) pt_seq_len / (float) n_patches_per_col;
+                int d_head_local = hparams.n_embd / hparams.n_head;
+                int half_local = d_head_local/2;
+                std::vector<float> rope_c_first(half_local);
+                std::vector<float> rope_c_second(half_local);
+                float odd = std::pow(hparams.rope_theta, (float)-2.0f / (float)d_head_local);
+                for (int k = 0; k < half_local; ++k) {
+                    rope_c_first[k] = 1.0f / s;
+                    rope_c_second[k] = 1.0f / (s * odd);
+                }
+                // set inputs
+                auto t1 = ggml_graph_get_tensor(gf, "rope_c_first");
+                auto t2 = ggml_graph_get_tensor(gf, "rope_c_second");
+                GGML_ASSERT(t1 && (t1->flags & GGML_TENSOR_FLAG_INPUT));
+                GGML_ASSERT(t2 && (t2->flags & GGML_TENSOR_FLAG_INPUT));
+                ggml_backend_tensor_set(t1, rope_c_first.data(), 0, ggml_nbytes(t1));
+                ggml_backend_tensor_set(t2, rope_c_second.data(), 0, ggml_nbytes(t2));
+
+                // unified ggml_rope_ext path; no precomputed cos/sin tables
+            }
+            break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_LDP:
@@ -4340,17 +4735,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }

-    // print debug nodes
-    if (ctx->debug_graph) {
-        LOG_INF("\n\n---\n\n");
-        LOG_INF("\n\nDebug graph:\n\n");
-        for (ggml_tensor * t : ctx->debug_print_tensors) {
-            std::vector<uint8_t> data(ggml_nbytes(t));
-            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
-            print_tensor_shape(t);
-            print_tensor_data(t, data.data(), 3);
-        }
-    }
+    // debug printing and env-based dumps removed

     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
@@ -4378,6 +4763,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_PIXTRAL:
            return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_JINACLIP2:
+            return ctx->model.hparams.projection_dim;
        case PROJECTOR_TYPE_MLP_NORM:
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
@@ -4432,6 +4819,10 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
 }

+bool clip_is_jinaclip2(const struct clip_ctx * ctx) {
+    return ctx->proj_type() == PROJECTOR_TYPE_JINACLIP2;
+}
+
 bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
     return ctx->model.modality == CLIP_MODALITY_VISION;
 }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h
index 3387cdbd36955..65f9b4185fc46 100644
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -95,6 +95,7 @@ bool clip_is_glm(const struct clip_ctx * ctx);
 bool clip_is_qwen2vl(const struct clip_ctx * ctx);
 bool clip_is_llava(const struct clip_ctx * ctx);
 bool clip_is_gemma3(const struct clip_ctx * ctx);
+bool clip_is_jinaclip2(const struct clip_ctx * ctx);

 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 5fde6ca0c32ae..556d617abe1e8 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -39,7 +39,13 @@ static void show_additional_info(int /*argc*/, char ** argv) {
     LOG(
         "Experimental CLI for multimodal\n\n"
         "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> --audio