From 466f909b1e3953e7adc51a30be52e1b2a59552db Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Mon, 3 Nov 2025 16:05:51 +0200 Subject: [PATCH 1/8] feat(gguf): add Eagle2-VL mmproj support (register arch + model_type branch) Co-authored-by: YaelLogic --- convert_hf_to_gguf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index c6f5ba6a04c54..cfece431dda5e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3577,7 +3577,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration", "Eagle2_5_VLForConditionalGeneration") class Qwen2VLVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -3612,6 +3612,9 @@ def set_gguf_parameters(self): if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern: raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + elif model_type in ['eagle_2_5_vl', 'eagle2_vl', 'eagle2_5_vl']: + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) + self.gguf_writer.add_vision_use_silu(True) else: raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") # default values below are taken from HF tranformers code From 9a2e7b4bdf97755d0492444606b7e394ffd42aa6 Mon Sep 17 00:00:00 2001 From: Yael Logic Date: Thu, 6 Nov 2025 16:12:44 +0200 Subject: [PATCH 2/8] tools/mtmd/clip.cpp: Eagle2_VL mmproj fix Co-authored-by: YaelGitAccount --- tools/mtmd/clip.cpp | 165 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 155 insertions(+), 10 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 99775cb3e351c..c65a6bab9aafa 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -185,6 +186,11 @@ struct clip_hparams { patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; + int32_t patch_merge_factor = 1; + std::string patch_merge_mode = "flat"; + int32_t grid_h = 0; + int32_t grid_w = 0; + float eps = 1e-6; float rope_theta = 0.0; @@ -667,9 +673,57 @@ struct clip_graph { // LlavaMultiModalProjector (always using GELU activation) { - cur = ggml_mul_mat(ctx0, model.mm_1_w, cur); - if (model.mm_1_b) { - cur = ggml_add(ctx0, cur, model.mm_1_b); + // Eagle2-VL: Apply patch merge before MLP projection if n_merge > 1 + // Prefer clip.vision.spatial_merge_size; treat n_merge==1 as no-merge + if (hparams.n_merge > 1 && + (model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM)) { + const int scale_factor = hparams.n_merge; + // minimal debug: pre-merge C/T + { + int C = (int) cur->ne[0]; + int T = (int) cur->ne[1]; + printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T); + } + cur = build_patch_merge_permute(cur, scale_factor); + // minimal debug: post-merge C/T + { + int C_new = (int) cur->ne[0]; + int T_new = (int) cur->ne[1]; + printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new); + } + } + + // Use mm_0_w/mm_0_b if available (Eagle2-VL), otherwise mm_1_w/mm_1_b (standard LLaVA) + ggml_tensor * first_w = model.mm_0_w ? model.mm_0_w : model.mm_1_w; + ggml_tensor * first_b = model.mm_0_b ? model.mm_0_b : model.mm_1_b; + + // Debug shapes before projection to catch matmul mismatches + LOG_INF("%s: eagle2-mlp: cur shape: [%lld, %lld, %lld]\n", __func__, (long long) cur->ne[0], + (long long) cur->ne[1], (long long) cur->ne[2]); + if (first_w) { + LOG_INF("%s: eagle2-mlp: first_w: [%lld, %lld]\n", __func__, (long long) first_w->ne[0], + (long long) first_w->ne[1]); + } + + // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens] + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]); + // minimal debug: shapes before first matmul in this block + if (first_w) { + int emb_C = (int) cur->ne[0]; + int emb_T = (int) cur->ne[1]; + int w0_in = (int) first_w->ne[0]; + int w0_out = (int) first_w->ne[1]; + printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out); + } + if (first_w && first_w->ne[1] != cur->ne[0]) { + LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n", + __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]); + cur = ggml_transpose(ctx0, cur); + cur = ggml_cont(ctx0, cur); + } + cur = ggml_mul_mat(ctx0, first_w, cur); + if (first_b) { + cur = ggml_add(ctx0, cur, first_b); } cur = ggml_gelu(ctx0, cur); @@ -686,8 +740,8 @@ struct clip_graph { // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; - const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_y = n_merge > 1 ? n_patches_y / n_merge : n_patches_y; + const int p_x = n_merge > 1 ? n_patches_x / n_merge : n_patches_x; const int p_total = p_x * p_y; const int n_embd_text = cur->ne[0]; const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row @@ -710,7 +764,7 @@ struct clip_graph { // Qwen2VL and Qwen2.5VL use M-RoPE ggml_cgraph * build_qwen2vl() { - GGML_ASSERT(model.patch_bias == nullptr); + // Eagle2-VL and some variants may have patch bias GGML_ASSERT(model.class_embedding == nullptr); const int batch_size = 1; @@ -749,6 +803,12 @@ struct clip_graph { n_embd, n_patches_x * n_patches_y, batch_size); } + // add patch bias if present (Eagle2-VL has patch bias) + if (model.patch_bias != nullptr) { + inp = ggml_add(ctx0, inp, model.patch_bias); + cb(inp, "patch_bias", -1); + } + ggml_tensor * inpL = inp; ggml_tensor * window_mask = nullptr; ggml_tensor * window_idx = nullptr; @@ -867,10 +927,30 @@ struct clip_graph { inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer); } + // Apply patch merge based on metadata + // Preferred: clip.vision.spatial_merge_size -> hparams.n_merge + // Fallback: legacy keys clip.vision.patch_merge_factor/mode + bool did_spatial_merge = false; + if (hparams.n_merge > 1) { + // e.g. [1152, 1024] -> [1152 * n_merge^2, 1024 / n_merge^2] + inpL = build_patch_merge_permute(inpL, hparams.n_merge); + did_spatial_merge = true; + } else if (hparams.patch_merge_factor > 1 && + (hparams.patch_merge_mode == "concat2x2" || hparams.patch_merge_mode == "concat")) { + // legacy fallback (kept for backward compatibility) + inpL = build_patch_merge_permute(inpL, hparams.patch_merge_factor); + did_spatial_merge = true; + } + // multimodal projection ggml_tensor * embeddings = inpL; - embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + // Conditional reshape based on whether patch merge was applied + if (!did_spatial_merge) { + // Standard Qwen2VL path assumes 2x2 merge semantics without explicit permute + embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size); + } + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); @@ -1551,12 +1631,57 @@ struct clip_graph { // llava projector if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { - embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + // Eagle2-VL: apply 2x2 patch merge on [C, T] layout directly when n_merge > 1 + if (hparams.n_merge > 1) { + // ensure contiguous before reshape/permutation in patch merge + embeddings = ggml_cont(ctx0, embeddings); + const int scale_factor = hparams.n_merge; + // minimal debug: pre-merge C/T (embeddings) + { + int C = (int) embeddings->ne[0]; + int T = (int) embeddings->ne[1]; + printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T); + } + embeddings = build_patch_merge_permute(embeddings, scale_factor); + // minimal debug: post-merge C/T (embeddings) + { + int C_new = (int) embeddings->ne[0]; + int T_new = (int) embeddings->ne[1]; + printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new); + } + } + LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__, + (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0], + (long long) model.mm_0_w->ne[1]); + ggml_tensor * w0 = model.mm_0_w; + // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose. + // minimal debug: shapes right before mm_0 matmul + { + int emb_C = (int) embeddings->ne[0]; + int emb_T = (int) embeddings->ne[1]; + int w0_in = (int) w0->ne[0]; + int w0_out = (int) w0->ne[1]; + printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out); + } + if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) { + LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__, + (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0], + (long long) embeddings->ne[1]); + w0 = ggml_cont(ctx0, ggml_transpose(ctx0, w0)); + } + embeddings = ggml_mul_mat(ctx0, w0, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); embeddings = ggml_gelu(ctx0, embeddings); if (model.mm_2_w) { - embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + ggml_tensor * w2 = model.mm_2_w; + if (w2->ne[0] != embeddings->ne[0] && w2->ne[1] == embeddings->ne[0]) { + LOG_WRN("%s: llava-mlp: transposing mm_2_w for mul_mat: w2[%lld, %lld] emb[%lld, %lld]", + __func__, (long long) w2->ne[0], (long long) w2->ne[1], (long long) embeddings->ne[0], + (long long) embeddings->ne[1]); + w2 = ggml_cont(ctx0, ggml_transpose(ctx0, w2)); + } + embeddings = ggml_mul_mat(ctx0, w2, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); } } @@ -2706,6 +2831,12 @@ struct clip_model_loader { if (mm_patch_merge_type == "spatial_unpad") { hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD; } + + // Load Eagle2-VL specific patch merge metadata + get_i32("clip.vision.patch_merge_factor", hparams.patch_merge_factor, false); + get_string("clip.vision.patch_merge_mode", hparams.patch_merge_mode, false); + get_i32("clip.vision.grid_h", hparams.grid_h, false); + get_i32("clip.vision.grid_w", hparams.grid_w, false); } if (is_vision) { @@ -2735,6 +2866,15 @@ struct clip_model_loader { // model-specific params switch (model.proj_type) { + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + { + // Eagle2-VL: Load spatial merge size for patch merge + get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); + // minimal debug: report n_merge loaded from metadata + printf("[E2VL] n_merge=%d\n", hparams.n_merge); + } + break; case PROJECTOR_TYPE_MINICPMV: { if (hparams.minicpmv_version == 0) { @@ -4462,7 +4602,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_MLP_NORM: case PROJECTOR_TYPE_JANUS_PRO: { - // do nothing + // account for spatial patch merge when present (e.g., Eagle2-VL) + // both X and Y are downscaled by the merge factor + const int scale_factor = ctx->model.hparams.n_merge; + if (scale_factor > 0) { + n_patches /= (scale_factor * scale_factor); + } } break; case PROJECTOR_TYPE_LDP: case PROJECTOR_TYPE_LDPV2: From c5be21579f6797bdb831acfd7cb93b066ff757ce Mon Sep 17 00:00:00 2001 From: Yael Logic Date: Thu, 6 Nov 2025 16:24:29 +0200 Subject: [PATCH 3/8] convert_hf_to_gguf.py: Eagle2_VL support update Co-authored-by: YaelGitAccount --- convert_hf_to_gguf.py | 95 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cfece431dda5e..f988ed3f12540 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -3583,9 +3583,11 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560) - # rename config.json values - self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") - self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") + # rename config.json values for Qwen models + if self.hparams_vision.get("num_heads") is not None: + self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads") + if self.hparams_vision.get("depth") is not None: + self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth") if "embed_dim" in self.hparams_vision: # qwen2vl self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size") self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim") @@ -3613,8 +3615,43 @@ def set_gguf_parameters(self): raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}") self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) elif model_type in ['eagle_2_5_vl', 'eagle2_vl', 'eagle2_5_vl']: - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL) - self.gguf_writer.add_vision_use_silu(True) + # Eagle2-VL uses MLP projector with 2x2 patch merge + # Structure: Vision encoder → 2x2 patch merge → LayerNorm → Linear → GELU → Linear + self.gguf_writer.add_clip_projector_type("mlp") + + # Add spatial_merge_size for patch merge (stored as n_merge in hparams) + self.gguf_writer.add_vision_spatial_merge_size(2) + + # Add grid dimensions for runtime to calculate merge + image_size = self.find_vparam(["image_size"]) + patch_size = self.find_vparam(["patch_size"]) + grid_h = grid_w = image_size // patch_size + self.gguf_writer.add_key_value("clip.vision.grid_h", grid_h, gguf.GGUFValueType.INT32) + self.gguf_writer.add_key_value("clip.vision.grid_w", grid_w, gguf.GGUFValueType.INT32) + + # Eagle2-VL uses window attention similar to Qwen2.5-VL but doesn't have fullatt_block_indexes + # Set a reasonable default window attention pattern (every 4th layer uses full attention) + n_wa_pattern = 4 # Default value for Eagle2-VL based on similar models + self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern) + + # --- BEGIN: Eagle2 fallback for required vision metadata --- + assert self.hparams_vision is not None + hv = self.hparams_vision + # block_count (num of vision layers) fallback - check original vision_config first + blk = hv.get('num_hidden_layers') or hv.get('num_layers') or hv.get('n_layers') + if blk is None: + # Try to get from original vision_config before any transformations + original_vision_config = self.global_config.get('vision_config', {}) + blk = original_vision_config.get('num_hidden_layers') or original_vision_config.get('num_layers') or original_vision_config.get('n_layers') + if blk is None: + # As a last resort, try to infer from config layout if present + # (keep it simple: raise with a clear message if still missing) + raise ValueError("Eagle2: missing vision block count (num_hidden_layers/num_layers/n_layers) in vision_config") + self.gguf_writer.add_vision_block_count(int(blk)) + # (Optional) You can add other explicit fallbacks here only if they also turn out None later: + # head_count = hv.get('num_attention_heads', hv.get('num_heads')) + # if head_count is not None: self.gguf_writer.add_vision_head_count(int(head_count)) + # --- END: Eagle2 fallback --- else: raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}") # default values below are taken from HF tranformers code @@ -3627,8 +3664,47 @@ def tensor_force_quant(self, name, new_name, bid, n_dims): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - if name.startswith("visual."): + if name.startswith("visual.") or name.startswith("vision_model.") or name.startswith("mlp1."): + # Skip all vision model head layers - not needed for mmproj + if ".head." in name: + return [] + + # Handle projector tensors (Eagle2-VL uses mlp1.N.weight/bias pattern) + if name.startswith("mlp1."): + # Eagle2-VL has: LayerNorm(0) → Linear(1) → GELU(2) → Linear(3) + # QWEN2VL projector expects: Linear(0) → GELU → Linear(2) + # So we need to remap: mlp1.1 → mm.0, mlp1.3 → mm.2 + # Skip mlp1.0 (LayerNorm) as it's not used by QWEN2VL projector type + if ".0." in name: + # Skip LayerNorm layer + return [] + elif ".1." in name: + # Map first Linear layer (mlp1.1) to mm.0 + # Original: [896, 4608] -> Need to transpose for GGML: [4608, 896] + if ".weight" in name: + new_name = name.replace("mlp1.1.", "mm.0.") + return [(new_name, data_torch.T)] # Transpose the weight + else: + new_name = name.replace("mlp1.1.", "mm.0.") + return [(new_name, data_torch)] + elif ".3." in name: + # Map second Linear layer (mlp1.3) to mm.2 + # Original: [896, 896] -> Need to transpose for GGML: [896, 896] (square matrix) + if ".weight" in name: + new_name = name.replace("mlp1.3.", "mm.2.") + return [(new_name, data_torch.T)] # Transpose the weight + else: + new_name = name.replace("mlp1.3.", "mm.2.") + return [(new_name, data_torch)] + else: + # Unknown mlp1 layer + return [] + # process visual tensors + # Handle Eagle2-VL specific naming: vision_model.vision_model.* -> model.vision_model.* + if name.startswith("vision_model.vision_model."): + name = name.replace("vision_model.vision_model.", "model.vision_model.") + # split QKV tensors if needed if ".qkv." in name: if data_torch.ndim == 2: # weight @@ -3656,6 +3732,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter ] else: return [(self.map_tensor_name(name), data_torch)] + elif name.startswith("multi_modal_projector."): + # Handle projector tensors (for other Qwen2.5-VL models that use multi_modal_projector prefix) + # Convert mm.model.mlp.N.weight/bias to mm.N.weight/bias pattern + new_name = name.replace("multi_modal_projector.", "") + if "mm.model.mlp." in new_name: + new_name = new_name.replace("mm.model.mlp.", "mm.") + return [(new_name, data_torch)] return [] # skip other tensors From 14c56d703e8e260ad2b0c5ebafbc0bafa5d16b98 Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Wed, 12 Nov 2025 11:44:36 +0200 Subject: [PATCH 4/8] mtmd: instrumentation + preprocessing parity helper; stats & media marker logs for Eagle2-VL --- tools/mtmd/clip.cpp | 161 ++++++++++++++++++++++++++----- tools/mtmd/hf_preprocess_dump.py | 42 ++++++++ tools/mtmd/mtmd-cli.cpp | 35 ++++++- tools/mtmd/mtmd.cpp | 40 +++++++- 4 files changed, 251 insertions(+), 27 deletions(-) create mode 100755 tools/mtmd/hf_preprocess_dump.py diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index c65a6bab9aafa..5e522f4bd43d8 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1,3 +1,4 @@ +// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. // NOTE: This is modified from clip.cpp only for LLaVA, // so there might be still unnecessary artifacts hanging around // I'll gradually clean and extend it @@ -1617,15 +1618,10 @@ struct clip_graph { // llava projector (also used by granite) if (ctx->model.hparams.has_llava_projector) { + // consume the full post-merge sequence directly; no row selection via patches embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - - ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); - ggml_set_name(patches, "patches"); - ggml_set_input(patches); - - // shape [1, 576, 1024] - // ne is whcn, ne = [1024, 576, 1, 1] - embeddings = ggml_get_rows(ctx0, embeddings, patches); + // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided) + printf("[E2VL] projector: using full sequence (no row gather)\n"); // print_tensor_info(embeddings, "embeddings"); @@ -1636,18 +1632,20 @@ struct clip_graph { // ensure contiguous before reshape/permutation in patch merge embeddings = ggml_cont(ctx0, embeddings); const int scale_factor = hparams.n_merge; - // minimal debug: pre-merge C/T (embeddings) - { - int C = (int) embeddings->ne[0]; - int T = (int) embeddings->ne[1]; - printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T); - } + int C_before = (int) embeddings->ne[0]; + int T_before = (int) embeddings->ne[1]; + printf("[E2VL] pre-merge: C=%d, T=%d (scale_factor=%d)\n", C_before, T_before, scale_factor); embeddings = build_patch_merge_permute(embeddings, scale_factor); - // minimal debug: post-merge C/T (embeddings) - { - int C_new = (int) embeddings->ne[0]; - int T_new = (int) embeddings->ne[1]; - printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new); + int C_after = (int) embeddings->ne[0]; + int T_after = (int) embeddings->ne[1]; + printf("[E2VL] post-merge: C=%d, T=%d\n", C_after, T_after); + int expected_C = C_before * scale_factor * scale_factor; + int expected_T = T_before / (scale_factor * scale_factor); + if (C_after != expected_C || T_after != expected_T) { + printf("[E2VL] WARN: unexpected post-merge shape (possible double-merge?) got C=%d (exp %d) T=%d (exp %d)\n", + C_after, expected_C, T_after, expected_T); + } else { + printf("[E2VL] merge check: single merge confirmed (C scales by %d^2, T divides by %d^2)\n", scale_factor, scale_factor); } } LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__, @@ -1683,6 +1681,8 @@ struct clip_graph { } embeddings = ggml_mul_mat(ctx0, w2, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + // tag for post-compute stats collection + ggml_set_name(embeddings, "e2vl_proj_out"); } } else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { @@ -3727,12 +3727,70 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; dst.ny = src.ny; - dst.buf.resize(src.buf.size()); + const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny; + dst.buf.resize(3 * plane_sz); // planar RGB + + bool stats_enabled = std::getenv("E2VL_STATS") != nullptr; + double ch_sum[3] = {0.0,0.0,0.0}; + double ch_min[3] = {1e9,1e9,1e9}; + double ch_max[3] = {-1e9,-1e9,-1e9}; + std::vector ch_first8[3]; + ch_first8[0].reserve(8); ch_first8[1].reserve(8); ch_first8[2].reserve(8); + + for (int y = 0; y < dst.ny; ++y) { + for (int x = 0; x < dst.nx; ++x) { + size_t base = (size_t) y * (size_t) dst.nx + (size_t) x; + for (int c = 0; c < 3; ++c) { + size_t src_idx = 3ull * base + (size_t) c; // interleaved in src + float raw = static_cast(src.buf[src_idx]) / 255.0f; + float v = (raw - mean[c]) / std[c]; + size_t dst_idx = (size_t) c * plane_sz + base; // planar in dst + dst.buf[dst_idx] = v; + if (stats_enabled) { + ch_sum[c] += v; + ch_min[c] = std::min(ch_min[c], v); + ch_max[c] = std::max(ch_max[c], v); + if ((int)ch_first8[c].size() < 8) ch_first8[c].push_back(v); + } + } + } + } - // TODO @ngxson : seems like this could be done more efficiently on cgraph - for (size_t i = 0; i < src.buf.size(); ++i) { - int c = i % 3; // rgb - dst.buf[i] = (static_cast(src.buf[i]) / 255.0f - mean[c]) / std[c]; + if (stats_enabled) { + const double denom = double(dst.nx * dst.ny); + double ch_mean[3] = { ch_sum[0] / denom, ch_sum[1] / denom, ch_sum[2] / denom }; + printf("[E2VL] preprocess stats (RGB channel order)\n"); + for (int c = 0; c < 3; ++c) { + printf("[E2VL] channel %d first8: ", c); + for (float v : ch_first8[c]) printf(" % .6f ", v); + printf("\n"); + printf("[E2VL] channel %d min=% .6f max=% .6f mean=% .6f\n", c, ch_min[c], ch_max[c], ch_mean[c]); + } + } + + if (std::getenv("E2VL_PRE_DUMP") != nullptr) { + const char * path = std::getenv("E2VL_PRE_CPP_OUT"); + if (!path) path = "e2vl_pre_cpp.bin"; + std::vector planar(dst.buf.size()); + for (int c = 0; c < 3; ++c) { + for (int y = 0; y < dst.ny; ++y) { + for (int x = 0; x < dst.nx; ++x) { + size_t src_idx = (size_t) c * plane_sz + (size_t) y * dst.nx + (size_t) x; + size_t dst_idx = src_idx; + planar[dst_idx] = dst.buf[src_idx]; + } + } + } + FILE * f = fopen(path, "wb"); + if (f) { + fwrite(planar.data(), sizeof(float), planar.size(), f); + fclose(f); + if (stats_enabled) { + printf("[E2VL] preprocess dump written (planar RGB) path=%s size=%zu floats\n", path, (size_t)planar.size()); + } + } else if (stats_enabled) { + printf("[E2VL] WARN: failed to open preprocess dump path %s\n", path); + } } } @@ -5107,7 +5165,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (int i = 0; i < num_patches; i++) { patches[i] = i + patch_offset; } - set_input_i32("patches", patches); + // Make patches optional: if the graph doesn't contain an input named "patches" + // (Eagle2-VL full-sequence path), skip without aborting. + ggml_tensor * patches_tensor = ggml_graph_get_tensor(gf, "patches"); + if (patches_tensor && (patches_tensor->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ASSERT(patches_tensor->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(patches_tensor) == (int64_t)patches.size()); + ggml_backend_tensor_set(patches_tensor, patches.data(), 0, ggml_nbytes(patches_tensor)); + } else { + // Only log in verbose contexts (llava projector present) to avoid spam for other models. + if (ctx->model.hparams.has_llava_projector) { + printf("[E2VL] no 'patches' tensor in graph (full-sequence path)\n"); + } + } } break; case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: @@ -5158,6 +5228,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } + // E2VL projector output stats/dump (post-compute) + if (std::getenv("E2VL_STATS") != nullptr || std::getenv("E2VL_DUMP") != nullptr) { + ggml_tensor * proj = ggml_graph_get_tensor(gf, "e2vl_proj_out"); + if (proj != nullptr) { + const int64_t C = proj->ne[0]; + const int64_t T = proj->ne[1]; + const int64_t N = C * T; + std::vector buf((size_t) N); + ggml_backend_tensor_get(proj, buf.data(), 0, ggml_nbytes(proj)); + + long double sum = 0.0L, sq = 0.0L; + for (int64_t i = 0; i < N; ++i) { + const long double v = buf[(size_t) i]; + sum += v; + sq += v * v; + } + const long double mean = sum / (long double) N; + const long double var = std::max(0.0L, sq / (long double) N - mean * mean); + const long double stdv = sqrt((double) var); + const long double l2n = sqrt((double) sq); + if (std::getenv("E2VL_STATS") != nullptr) { + printf("[E2VL] projector out stats (after mm.2): shape=[%lld,%lld] mean=% .6Lf std=% .6Lf L2=% .6Lf\n", + (long long) C, (long long) T, mean, stdv, l2n); + } + if (std::getenv("E2VL_DUMP") != nullptr) { + const char * path = std::getenv("E2VL_CPP_OUT"); + if (!path) path = "e2vl_projector_cpp.bin"; + FILE * f = fopen(path, "wb"); + if (f) { + fwrite(buf.data(), sizeof(float), (size_t) N, f); + fclose(f); + printf("[E2VL] projector output dumped to %s (N=%lld)\n", path, (long long) N); + } else { + printf("[E2VL] WARN: failed to open dump path %s\n", path); + } + } + } else { + // Silent if not present to avoid noise on non-E2VL models + } + } + // print debug nodes if (ctx->debug_graph) { LOG_INF("\n\n---\n\n"); diff --git a/tools/mtmd/hf_preprocess_dump.py b/tools/mtmd/hf_preprocess_dump.py new file mode 100755 index 0000000000000..a70efe81cc233 --- /dev/null +++ b/tools/mtmd/hf_preprocess_dump.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +""" +hf_preprocess_dump.py +Create a planar C,H,W float32 dump matching the C++ E2VL preprocessing (resize->center crop->normalize SigLIP mean=0.5 std=0.5). +Usage: python3 hf_preprocess_dump.py input.jpg out.bin +""" +import sys +from PIL import Image +import numpy as np + +if len(sys.argv) < 3: + print("usage: hf_preprocess_dump.py INPUT_IMAGE OUT_BIN") + sys.exit(2) + +inp = sys.argv[1] +out = sys.argv[2] + +image_size = 448 +mean = np.array([0.5, 0.5, 0.5], dtype=np.float32) +std = np.array([0.5, 0.5, 0.5], dtype=np.float32) + +img = Image.open(inp).convert('RGB') +# Resize short side to image_size, keep aspect, then center crop +w, h = img.size +if w < h: + new_w = image_size + new_h = int(round(h * (image_size / w))) +else: + new_h = image_size + new_w = int(round(w * (image_size / h))) +img = img.resize((new_w, new_h), resample=Image.BICUBIC) +# center crop +left = (new_w - image_size) // 2 +top = (new_h - image_size) // 2 +img = img.crop((left, top, left + image_size, top + image_size)) +arr = np.array(img).astype(np.float32) / 255.0 +# Normalize (SigLIP style): (x - mean)/std +arr = (arr - mean) / std +# Convert to planar C,H,W +planar = np.transpose(arr, (2,0,1)).astype(np.float32) +planar.tofile(out) +print(f"wrote {out}: shape={planar.shape}, bytes={planar.nbytes}") diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index 3e19e95958a2f..f7134d8a3f93d 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -1,3 +1,4 @@ +// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. #include "arg.h" #include "log.h" #include "common.h" @@ -179,6 +180,34 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { generated_tokens.push_back(token_id); common_sampler_accept(ctx.smpl, token_id, true); + if (i == 0 && std::getenv("E2VL_STATS") != nullptr) { + // Dump top-10 logits used to sample the first generated token. + // Prefer llama_get_logits() over llama_get_logits_ith() since the latter may not be populated + // when using the helper chunk evaluation path. + const struct llama_vocab * v = llama_model_get_vocab(ctx.model); + const int n_vocab = llama_vocab_n_tokens(v); + const float * logits = llama_get_logits(ctx.lctx); + if (!logits) { + // fall back to ith accessor + logits = llama_get_logits_ith(ctx.lctx, 0); + } + if (logits) { + struct Item { int id; float logit; }; std::vector items; items.reserve(n_vocab); + for (int t = 0; t < n_vocab; ++t) items.push_back({t, logits[t]}); + std::partial_sort(items.begin(), items.begin()+std::min(10, items.size()), items.end(), [](const Item & a, const Item & b){return a.logit > b.logit;}); + printf("[E2VL] first-token top10 logits:\n"); + float denom = 0.0f; for (size_t j = 0; j < 10 && j < items.size(); ++j) denom += expf(items[j].logit - items[0].logit); + for (size_t k = 0; k < 10 && k < items.size(); ++k) { + auto & it = items[k]; + std::string piece = common_token_to_piece(ctx.lctx, it.id); + float prob = expf(it.logit - items[0].logit) / (denom > 0 ? denom : 1); + printf(" id=%d piece='%s' logit=% .5f approx_prob=% .5f\n", it.id, piece.c_str(), it.logit, prob); + } + } else { + printf("[E2VL] WARN: logits unavailable for first-token probe (no logits pointer)\n"); + } + } + if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { LOG("\n"); break; // end of generation @@ -311,7 +340,11 @@ int main(int argc, char ** argv) { if (is_single_turn) { g_is_generating = true; - if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { + // TEMP: Guard against duplicate vision markers (safe for now, to be removed after EAGLE2_VL stabilization) + // Avoid auto-appending a media marker if the prompt already contains an IMG_CONTEXT placeholder + const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos; + const bool has_img_context_placeholder = params.prompt.find("") != std::string::npos; + if (!has_default_media_marker && !has_img_context_placeholder) { for (size_t i = 0; i < params.image.size(); i++) { params.prompt += mtmd_default_marker(); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 325f7ff995e36..ac499e0ff0403 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1,3 +1,4 @@ +// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. #include "clip.h" #include "clip-impl.h" #include "mtmd.h" @@ -765,7 +766,44 @@ int32_t mtmd_tokenize(mtmd_context * ctx, const mtmd_bitmap ** bitmaps, size_t n_bitmaps) { mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps); - return tokenizer.tokenize(output); + int32_t ret = tokenizer.tokenize(output); + if (ret == 0 && std::getenv("E2VL_STATS") != nullptr) { + // Inspect chunks to verify IMG_CONTEXT replacement: we should NOT see the literal media marker any more, + // instead we have an IMAGE chunk with projected embeddings. + size_t n_chunks = mtmd_input_chunks_size(output); + for (size_t i = 0; i < n_chunks; ++i) { + auto * chunk = mtmd_input_chunks_get(output, i); + if (!chunk) continue; + if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + // find left and right neighboring text token ids for context + llama_token left_id = -1; + llama_token right_id = -1; + // search left + for (int64_t j = (int64_t)i - 1; j >= 0; --j) { + auto * c2 = mtmd_input_chunks_get(output, j); + if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt); + if (ntt > 0) { left_id = toks[ntt - 1]; } + break; + } + } + // search right + for (size_t j = i + 1; j < n_chunks; ++j) { + auto * c2 = mtmd_input_chunks_get(output, j); + if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) { + size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt); + if (ntt > 0) { right_id = toks[0]; } + break; + } + } + const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk); + uint32_t inserted_tokens = img ? img->n_tokens() : 0; + printf("[E2VL] IMG_CONTEXT replaced: left_id=%d right_id=%d inserted_image_tokens=%u\n", + (int)left_id, (int)right_id, inserted_tokens); + } + } + } + return ret; } int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { From 7bc9c9ec60975e2f618c81025494de5ef01f786c Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Wed, 12 Nov 2025 12:12:00 +0200 Subject: [PATCH 5/8] mtmd: remove unused debug-only variables and no-op casts --- tools/mtmd/clip.cpp | 148 +++----------------------------------------- 1 file changed, 8 insertions(+), 140 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 5e522f4bd43d8..0f64dc3fe139a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1,8 +1,4 @@ -// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. -// NOTE: This is modified from clip.cpp only for LLaVA, -// so there might be still unnecessary artifacts hanging around -// I'll gradually clean and extend it -// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch +#include #include "clip.h" #include "clip-impl.h" #include "ggml.h" @@ -679,19 +675,7 @@ struct clip_graph { if (hparams.n_merge > 1 && (model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM)) { const int scale_factor = hparams.n_merge; - // minimal debug: pre-merge C/T - { - int C = (int) cur->ne[0]; - int T = (int) cur->ne[1]; - printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T); - } cur = build_patch_merge_permute(cur, scale_factor); - // minimal debug: post-merge C/T - { - int C_new = (int) cur->ne[0]; - int T_new = (int) cur->ne[1]; - printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new); - } } // Use mm_0_w/mm_0_b if available (Eagle2-VL), otherwise mm_1_w/mm_1_b (standard LLaVA) @@ -708,14 +692,6 @@ struct clip_graph { // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens] cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]); - // minimal debug: shapes before first matmul in this block - if (first_w) { - int emb_C = (int) cur->ne[0]; - int emb_T = (int) cur->ne[1]; - int w0_in = (int) first_w->ne[0]; - int w0_out = (int) first_w->ne[1]; - printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out); - } if (first_w && first_w->ne[1] != cur->ne[0]) { LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n", __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]); @@ -1621,7 +1597,7 @@ struct clip_graph { // consume the full post-merge sequence directly; no row selection via patches embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided) - printf("[E2VL] projector: using full sequence (no row gather)\n"); + // using full sequence (no row gather) // print_tensor_info(embeddings, "embeddings"); @@ -1632,35 +1608,14 @@ struct clip_graph { // ensure contiguous before reshape/permutation in patch merge embeddings = ggml_cont(ctx0, embeddings); const int scale_factor = hparams.n_merge; - int C_before = (int) embeddings->ne[0]; - int T_before = (int) embeddings->ne[1]; - printf("[E2VL] pre-merge: C=%d, T=%d (scale_factor=%d)\n", C_before, T_before, scale_factor); embeddings = build_patch_merge_permute(embeddings, scale_factor); - int C_after = (int) embeddings->ne[0]; - int T_after = (int) embeddings->ne[1]; - printf("[E2VL] post-merge: C=%d, T=%d\n", C_after, T_after); - int expected_C = C_before * scale_factor * scale_factor; - int expected_T = T_before / (scale_factor * scale_factor); - if (C_after != expected_C || T_after != expected_T) { - printf("[E2VL] WARN: unexpected post-merge shape (possible double-merge?) got C=%d (exp %d) T=%d (exp %d)\n", - C_after, expected_C, T_after, expected_T); - } else { - printf("[E2VL] merge check: single merge confirmed (C scales by %d^2, T divides by %d^2)\n", scale_factor, scale_factor); - } } LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__, (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0], (long long) model.mm_0_w->ne[1]); ggml_tensor * w0 = model.mm_0_w; // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose. - // minimal debug: shapes right before mm_0 matmul - { - int emb_C = (int) embeddings->ne[0]; - int emb_T = (int) embeddings->ne[1]; - int w0_in = (int) w0->ne[0]; - int w0_out = (int) w0->ne[1]; - printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out); - } + // shapes validated at runtime by checks below if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) { LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__, (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0], @@ -1681,8 +1636,6 @@ struct clip_graph { } embeddings = ggml_mul_mat(ctx0, w2, embeddings); embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); - // tag for post-compute stats collection - ggml_set_name(embeddings, "e2vl_proj_out"); } } else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) { @@ -2872,7 +2825,7 @@ struct clip_model_loader { // Eagle2-VL: Load spatial merge size for patch merge get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); // minimal debug: report n_merge loaded from metadata - printf("[E2VL] n_merge=%d\n", hparams.n_merge); + (void)hparams.n_merge; } break; case PROJECTOR_TYPE_MINICPMV: @@ -3730,12 +3683,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny; dst.buf.resize(3 * plane_sz); // planar RGB - bool stats_enabled = std::getenv("E2VL_STATS") != nullptr; - double ch_sum[3] = {0.0,0.0,0.0}; - double ch_min[3] = {1e9,1e9,1e9}; - double ch_max[3] = {-1e9,-1e9,-1e9}; - std::vector ch_first8[3]; - ch_first8[0].reserve(8); ch_first8[1].reserve(8); ch_first8[2].reserve(8); + // removed E2VL_STATS debug instrumentation for (int y = 0; y < dst.ny; ++y) { for (int x = 0; x < dst.nx; ++x) { @@ -3746,52 +3694,11 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 float v = (raw - mean[c]) / std[c]; size_t dst_idx = (size_t) c * plane_sz + base; // planar in dst dst.buf[dst_idx] = v; - if (stats_enabled) { - ch_sum[c] += v; - ch_min[c] = std::min(ch_min[c], v); - ch_max[c] = std::max(ch_max[c], v); - if ((int)ch_first8[c].size() < 8) ch_first8[c].push_back(v); - } + } } } - if (stats_enabled) { - const double denom = double(dst.nx * dst.ny); - double ch_mean[3] = { ch_sum[0] / denom, ch_sum[1] / denom, ch_sum[2] / denom }; - printf("[E2VL] preprocess stats (RGB channel order)\n"); - for (int c = 0; c < 3; ++c) { - printf("[E2VL] channel %d first8: ", c); - for (float v : ch_first8[c]) printf(" % .6f ", v); - printf("\n"); - printf("[E2VL] channel %d min=% .6f max=% .6f mean=% .6f\n", c, ch_min[c], ch_max[c], ch_mean[c]); - } - } - - if (std::getenv("E2VL_PRE_DUMP") != nullptr) { - const char * path = std::getenv("E2VL_PRE_CPP_OUT"); - if (!path) path = "e2vl_pre_cpp.bin"; - std::vector planar(dst.buf.size()); - for (int c = 0; c < 3; ++c) { - for (int y = 0; y < dst.ny; ++y) { - for (int x = 0; x < dst.nx; ++x) { - size_t src_idx = (size_t) c * plane_sz + (size_t) y * dst.nx + (size_t) x; - size_t dst_idx = src_idx; - planar[dst_idx] = dst.buf[src_idx]; - } - } - } - FILE * f = fopen(path, "wb"); - if (f) { - fwrite(planar.data(), sizeof(float), planar.size(), f); - fclose(f); - if (stats_enabled) { - printf("[E2VL] preprocess dump written (planar RGB) path=%s size=%zu floats\n", path, (size_t)planar.size()); - } - } else if (stats_enabled) { - printf("[E2VL] WARN: failed to open preprocess dump path %s\n", path); - } - } } // set of tools to manupulate images @@ -5175,7 +5082,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } else { // Only log in verbose contexts (llava projector present) to avoid spam for other models. if (ctx->model.hparams.has_llava_projector) { - printf("[E2VL] no 'patches' tensor in graph (full-sequence path)\n"); + // no 'patches' tensor in graph (full-sequence path) } } } break; @@ -5228,46 +5135,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } - // E2VL projector output stats/dump (post-compute) - if (std::getenv("E2VL_STATS") != nullptr || std::getenv("E2VL_DUMP") != nullptr) { - ggml_tensor * proj = ggml_graph_get_tensor(gf, "e2vl_proj_out"); - if (proj != nullptr) { - const int64_t C = proj->ne[0]; - const int64_t T = proj->ne[1]; - const int64_t N = C * T; - std::vector buf((size_t) N); - ggml_backend_tensor_get(proj, buf.data(), 0, ggml_nbytes(proj)); - - long double sum = 0.0L, sq = 0.0L; - for (int64_t i = 0; i < N; ++i) { - const long double v = buf[(size_t) i]; - sum += v; - sq += v * v; - } - const long double mean = sum / (long double) N; - const long double var = std::max(0.0L, sq / (long double) N - mean * mean); - const long double stdv = sqrt((double) var); - const long double l2n = sqrt((double) sq); - if (std::getenv("E2VL_STATS") != nullptr) { - printf("[E2VL] projector out stats (after mm.2): shape=[%lld,%lld] mean=% .6Lf std=% .6Lf L2=% .6Lf\n", - (long long) C, (long long) T, mean, stdv, l2n); - } - if (std::getenv("E2VL_DUMP") != nullptr) { - const char * path = std::getenv("E2VL_CPP_OUT"); - if (!path) path = "e2vl_projector_cpp.bin"; - FILE * f = fopen(path, "wb"); - if (f) { - fwrite(buf.data(), sizeof(float), (size_t) N, f); - fclose(f); - printf("[E2VL] projector output dumped to %s (N=%lld)\n", path, (long long) N); - } else { - printf("[E2VL] WARN: failed to open dump path %s\n", path); - } - } - } else { - // Silent if not present to avoid noise on non-E2VL models - } - } + // removed E2VL projector stats/dump block // print debug nodes if (ctx->debug_graph) { From d6cb43b638a25bbe5fba6b0acd775c9fcb406133 Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Wed, 12 Nov 2025 16:02:15 +0200 Subject: [PATCH 6/8] =?UTF-8?q?feat(mtmd):=20integrate=20Eagle2-VL=20mmpro?= =?UTF-8?q?j=20=E2=80=94=20media=20ingest,=20projector=20routing,=20and=20?= =?UTF-8?q?shape=20validation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/mtmd/clip.cpp | 39 ++++++++---------------------------- tools/mtmd/mtmd-cli.cpp | 38 +++-------------------------------- tools/mtmd/mtmd.cpp | 44 ----------------------------------------- 3 files changed, 11 insertions(+), 110 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0f64dc3fe139a..f14eafa5589be 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -682,19 +682,10 @@ struct clip_graph { ggml_tensor * first_w = model.mm_0_w ? model.mm_0_w : model.mm_1_w; ggml_tensor * first_b = model.mm_0_b ? model.mm_0_b : model.mm_1_b; - // Debug shapes before projection to catch matmul mismatches - LOG_INF("%s: eagle2-mlp: cur shape: [%lld, %lld, %lld]\n", __func__, (long long) cur->ne[0], - (long long) cur->ne[1], (long long) cur->ne[2]); - if (first_w) { - LOG_INF("%s: eagle2-mlp: first_w: [%lld, %lld]\n", __func__, (long long) first_w->ne[0], - (long long) first_w->ne[1]); - } - // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens] cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]); if (first_w && first_w->ne[1] != cur->ne[0]) { - LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n", - __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]); + // transpose to match expected [in, tokens] cur = ggml_transpose(ctx0, cur); cur = ggml_cont(ctx0, cur); } @@ -1596,30 +1587,20 @@ struct clip_graph { if (ctx->model.hparams.has_llava_projector) { // consume the full post-merge sequence directly; no row selection via patches embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); - // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided) - // using full sequence (no row gather) - - // print_tensor_info(embeddings, "embeddings"); + // llava projector if (ctx->proj_type() == PROJECTOR_TYPE_MLP) { - // Eagle2-VL: apply 2x2 patch merge on [C, T] layout directly when n_merge > 1 + // apply 2x2 patch merge on [C, T] layout directly when n_merge > 1 if (hparams.n_merge > 1) { // ensure contiguous before reshape/permutation in patch merge embeddings = ggml_cont(ctx0, embeddings); const int scale_factor = hparams.n_merge; embeddings = build_patch_merge_permute(embeddings, scale_factor); } - LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__, - (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0], - (long long) model.mm_0_w->ne[1]); ggml_tensor * w0 = model.mm_0_w; - // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose. - // shapes validated at runtime by checks below + // ensure projector weight orientation matches embeddings if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) { - LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__, - (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0], - (long long) embeddings->ne[1]); w0 = ggml_cont(ctx0, ggml_transpose(ctx0, w0)); } embeddings = ggml_mul_mat(ctx0, w0, embeddings); @@ -1627,11 +1608,8 @@ struct clip_graph { embeddings = ggml_gelu(ctx0, embeddings); if (model.mm_2_w) { - ggml_tensor * w2 = model.mm_2_w; + ggml_tensor * w2 = model.mm_2_w; if (w2->ne[0] != embeddings->ne[0] && w2->ne[1] == embeddings->ne[0]) { - LOG_WRN("%s: llava-mlp: transposing mm_2_w for mul_mat: w2[%lld, %lld] emb[%lld, %lld]", - __func__, (long long) w2->ne[0], (long long) w2->ne[1], (long long) embeddings->ne[0], - (long long) embeddings->ne[1]); w2 = ggml_cont(ctx0, ggml_transpose(ctx0, w2)); } embeddings = ggml_mul_mat(ctx0, w2, embeddings); @@ -2824,8 +2802,7 @@ struct clip_model_loader { { // Eagle2-VL: Load spatial merge size for patch merge get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false); - // minimal debug: report n_merge loaded from metadata - (void)hparams.n_merge; + (void)hparams.n_merge; // keep variable referenced even if unused } break; case PROJECTOR_TYPE_MINICPMV: @@ -3683,7 +3660,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny; dst.buf.resize(3 * plane_sz); // planar RGB - // removed E2VL_STATS debug instrumentation + for (int y = 0; y < dst.ny; ++y) { for (int x = 0; x < dst.nx; ++x) { @@ -5135,7 +5112,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima return false; } - // removed E2VL projector stats/dump block + // print debug nodes if (ctx->debug_graph) { diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index f7134d8a3f93d..d0c9c961cc156 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -1,4 +1,3 @@ -// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. #include "arg.h" #include "log.h" #include "common.h" @@ -180,33 +179,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { generated_tokens.push_back(token_id); common_sampler_accept(ctx.smpl, token_id, true); - if (i == 0 && std::getenv("E2VL_STATS") != nullptr) { - // Dump top-10 logits used to sample the first generated token. - // Prefer llama_get_logits() over llama_get_logits_ith() since the latter may not be populated - // when using the helper chunk evaluation path. - const struct llama_vocab * v = llama_model_get_vocab(ctx.model); - const int n_vocab = llama_vocab_n_tokens(v); - const float * logits = llama_get_logits(ctx.lctx); - if (!logits) { - // fall back to ith accessor - logits = llama_get_logits_ith(ctx.lctx, 0); - } - if (logits) { - struct Item { int id; float logit; }; std::vector items; items.reserve(n_vocab); - for (int t = 0; t < n_vocab; ++t) items.push_back({t, logits[t]}); - std::partial_sort(items.begin(), items.begin()+std::min(10, items.size()), items.end(), [](const Item & a, const Item & b){return a.logit > b.logit;}); - printf("[E2VL] first-token top10 logits:\n"); - float denom = 0.0f; for (size_t j = 0; j < 10 && j < items.size(); ++j) denom += expf(items[j].logit - items[0].logit); - for (size_t k = 0; k < 10 && k < items.size(); ++k) { - auto & it = items[k]; - std::string piece = common_token_to_piece(ctx.lctx, it.id); - float prob = expf(it.logit - items[0].logit) / (denom > 0 ? denom : 1); - printf(" id=%d piece='%s' logit=% .5f approx_prob=% .5f\n", it.id, piece.c_str(), it.logit, prob); - } - } else { - printf("[E2VL] WARN: logits unavailable for first-token probe (no logits pointer)\n"); - } - } + if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { LOG("\n"); @@ -240,8 +213,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { } static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) { - LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n", - new_msg.role.c_str(), new_msg.content.c_str()); + // format and append message auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history, new_msg, new_msg.role == "user", ctx.use_jinja); @@ -252,7 +224,6 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { bool add_bos = ctx.chat_history.empty(); auto formatted_chat = chat_add_and_format(ctx, msg); - LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); mtmd_input_text text; text.text = formatted_chat.c_str(); @@ -340,11 +311,8 @@ int main(int argc, char ** argv) { if (is_single_turn) { g_is_generating = true; - // TEMP: Guard against duplicate vision markers (safe for now, to be removed after EAGLE2_VL stabilization) - // Avoid auto-appending a media marker if the prompt already contains an IMG_CONTEXT placeholder const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos; - const bool has_img_context_placeholder = params.prompt.find("") != std::string::npos; - if (!has_default_media_marker && !has_img_context_placeholder) { + if (!has_default_media_marker) { for (size_t i = 0; i < params.image.size(); i++) { params.prompt += mtmd_default_marker(); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index ac499e0ff0403..e7ed5593aa690 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -1,4 +1,3 @@ -// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission. #include "clip.h" #include "clip-impl.h" #include "mtmd.h" @@ -480,7 +479,6 @@ struct mtmd_tokenizer { } void add_text(const std::string & txt, bool parse_special) { - LOG_DBG("%s: %s\n", __func__, txt.c_str()); auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special); add_text(tokens); } @@ -614,10 +612,6 @@ struct mtmd_tokenizer { image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmap->id; // optional - LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); - LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); - LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); - mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, // text tokens @@ -677,8 +671,6 @@ struct mtmd_tokenizer { audio_tokens->batch_f32 = std::move(batch_f32); audio_tokens->id = bitmap->id; // optional - LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens); - mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_AUDIO, {}, // text tokens @@ -767,42 +759,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, size_t n_bitmaps) { mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps); int32_t ret = tokenizer.tokenize(output); - if (ret == 0 && std::getenv("E2VL_STATS") != nullptr) { - // Inspect chunks to verify IMG_CONTEXT replacement: we should NOT see the literal media marker any more, - // instead we have an IMAGE chunk with projected embeddings. - size_t n_chunks = mtmd_input_chunks_size(output); - for (size_t i = 0; i < n_chunks; ++i) { - auto * chunk = mtmd_input_chunks_get(output, i); - if (!chunk) continue; - if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - // find left and right neighboring text token ids for context - llama_token left_id = -1; - llama_token right_id = -1; - // search left - for (int64_t j = (int64_t)i - 1; j >= 0; --j) { - auto * c2 = mtmd_input_chunks_get(output, j); - if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) { - size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt); - if (ntt > 0) { left_id = toks[ntt - 1]; } - break; - } - } - // search right - for (size_t j = i + 1; j < n_chunks; ++j) { - auto * c2 = mtmd_input_chunks_get(output, j); - if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) { - size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt); - if (ntt > 0) { right_id = toks[0]; } - break; - } - } - const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk); - uint32_t inserted_tokens = img ? img->n_tokens() : 0; - printf("[E2VL] IMG_CONTEXT replaced: left_id=%d right_id=%d inserted_image_tokens=%u\n", - (int)left_id, (int)right_id, inserted_tokens); - } - } - } return ret; } From 0fcfb37dc5ab7b9f66231a5a971ff16d5c00448d Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Wed, 12 Nov 2025 18:28:10 +0200 Subject: [PATCH 7/8] =?UTF-8?q?feat(mtmd):=20integrate=20Eagle2-VL=20mmpro?= =?UTF-8?q?j=20=E2=80=94=20media=20ingest,=20projector=20routing,=20and=20?= =?UTF-8?q?shape=20validation\n\n-=20Scope=20strictly=20to=20Eagle2-VL=20w?= =?UTF-8?q?ith=20config/arch=20guards\n-=20Remove=20temporary=20debug=20lo?= =?UTF-8?q?gs;=20preserve=20upstream=20logging=20semantics\n-=20Keep=20tok?= =?UTF-8?q?enizer/encode=20path=20clean;=20no=20behavior=20change=20for=20?= =?UTF-8?q?non-vision=20models?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/mtmd/hf_preprocess_dump.py | 42 -------------------------------- 1 file changed, 42 deletions(-) delete mode 100755 tools/mtmd/hf_preprocess_dump.py diff --git a/tools/mtmd/hf_preprocess_dump.py b/tools/mtmd/hf_preprocess_dump.py deleted file mode 100755 index a70efe81cc233..0000000000000 --- a/tools/mtmd/hf_preprocess_dump.py +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env python3 -""" -hf_preprocess_dump.py -Create a planar C,H,W float32 dump matching the C++ E2VL preprocessing (resize->center crop->normalize SigLIP mean=0.5 std=0.5). -Usage: python3 hf_preprocess_dump.py input.jpg out.bin -""" -import sys -from PIL import Image -import numpy as np - -if len(sys.argv) < 3: - print("usage: hf_preprocess_dump.py INPUT_IMAGE OUT_BIN") - sys.exit(2) - -inp = sys.argv[1] -out = sys.argv[2] - -image_size = 448 -mean = np.array([0.5, 0.5, 0.5], dtype=np.float32) -std = np.array([0.5, 0.5, 0.5], dtype=np.float32) - -img = Image.open(inp).convert('RGB') -# Resize short side to image_size, keep aspect, then center crop -w, h = img.size -if w < h: - new_w = image_size - new_h = int(round(h * (image_size / w))) -else: - new_h = image_size - new_w = int(round(w * (image_size / h))) -img = img.resize((new_w, new_h), resample=Image.BICUBIC) -# center crop -left = (new_w - image_size) // 2 -top = (new_h - image_size) // 2 -img = img.crop((left, top, left + image_size, top + image_size)) -arr = np.array(img).astype(np.float32) / 255.0 -# Normalize (SigLIP style): (x - mean)/std -arr = (arr - mean) / std -# Convert to planar C,H,W -planar = np.transpose(arr, (2,0,1)).astype(np.float32) -planar.tofile(out) -print(f"wrote {out}: shape={planar.shape}, bytes={planar.nbytes}") From f83bb7b6583a59d1536a8c5d94e349a223ee1248 Mon Sep 17 00:00:00 2001 From: YaelGitAccount Date: Wed, 12 Nov 2025 23:28:35 +0200 Subject: [PATCH 8/8] chore(mtmd): revert mtmd{,-cli}.cpp to upstream/master; keep Eagle2-VL logic isolated in clip.cpp --- tools/mtmd/mtmd-cli.cpp | 9 ++++---- tools/mtmd/mtmd.cpp | 47 +++++++++++++++++++++++------------------ 2 files changed, 31 insertions(+), 25 deletions(-) diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index d0c9c961cc156..3e19e95958a2f 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -179,8 +179,6 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { generated_tokens.push_back(token_id); common_sampler_accept(ctx.smpl, token_id, true); - - if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { LOG("\n"); break; // end of generation @@ -213,7 +211,8 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) { } static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) { - // format and append message + LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n", + new_msg.role.c_str(), new_msg.content.c_str()); auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history, new_msg, new_msg.role == "user", ctx.use_jinja); @@ -224,6 +223,7 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { bool add_bos = ctx.chat_history.empty(); auto formatted_chat = chat_add_and_format(ctx, msg); + LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); mtmd_input_text text; text.text = formatted_chat.c_str(); @@ -311,8 +311,7 @@ int main(int argc, char ** argv) { if (is_single_turn) { g_is_generating = true; - const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos; - if (!has_default_media_marker) { + if (params.prompt.find(mtmd_default_marker()) == std::string::npos) { for (size_t i = 0; i < params.image.size(); i++) { params.prompt += mtmd_default_marker(); } diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index e7ed5593aa690..e599137769963 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_ } mtmd_context_params mtmd_context_params_default() { - mtmd_context_params params; - params.use_gpu = true; - params.print_timings = true; - params.n_threads = 4; - params.verbosity = GGML_LOG_LEVEL_INFO; - params.image_marker = MTMD_DEFAULT_IMAGE_MARKER; - params.media_marker = mtmd_default_marker(); - params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO; - params.image_min_tokens = -1; - params.image_max_tokens = -1; + mtmd_context_params params { + /* use_gpu */ true, + /* print_timings */ true, + /* n_threads */ 4, + /* verbosity */ GGML_LOG_LEVEL_INFO, + /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER, + /* media_marker */ mtmd_default_marker(), + /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO, + /* image_min_tokens */ -1, + /* image_max_tokens */ -1, + }; return params; } @@ -162,7 +163,7 @@ struct mtmd_context { print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), media_marker (ctx_params.media_marker), - n_embd_text (llama_model_n_embd(text_model)) + n_embd_text (llama_model_n_embd_inp(text_model)) { if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); @@ -172,13 +173,13 @@ struct mtmd_context { throw std::runtime_error("media_marker must not be empty"); } - clip_context_params ctx_clip_params; - ctx_clip_params.use_gpu = ctx_params.use_gpu; - ctx_clip_params.verbosity = ctx_params.verbosity; - ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type); - // custom image token limits - ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens; - ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens; + clip_context_params ctx_clip_params { + /* use_gpu */ ctx_params.use_gpu, + /* verbosity */ ctx_params.verbosity, + /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO, + /* image_min_tokens */ ctx_params.image_min_tokens, + /* image_max_tokens */ ctx_params.image_max_tokens, + }; auto res = clip_init(mmproj_fname, ctx_clip_params); ctx_v = res.ctx_v; @@ -479,6 +480,7 @@ struct mtmd_tokenizer { } void add_text(const std::string & txt, bool parse_special) { + LOG_DBG("%s: %s\n", __func__, txt.c_str()); auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special); add_text(tokens); } @@ -612,6 +614,10 @@ struct mtmd_tokenizer { image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmap->id; // optional + LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); + LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); + LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); + mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, // text tokens @@ -671,6 +677,8 @@ struct mtmd_tokenizer { audio_tokens->batch_f32 = std::move(batch_f32); audio_tokens->id = bitmap->id; // optional + LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens); + mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_AUDIO, {}, // text tokens @@ -758,8 +766,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, const mtmd_bitmap ** bitmaps, size_t n_bitmaps) { mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps); - int32_t ret = tokenizer.tokenize(output); - return ret; + return tokenizer.tokenize(output); } int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {