From 466f909b1e3953e7adc51a30be52e1b2a59552db Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Mon, 3 Nov 2025 16:05:51 +0200
Subject: [PATCH 1/8] feat(gguf): add Eagle2-VL mmproj support (register arch +
 model_type branch)

Co-authored-by: YaelLogic <y0548591250@gmail.com>
---
 convert_hf_to_gguf.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c6f5ba6a04c54..cfece431dda5e 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3577,7 +3577,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
+@ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration", "Eagle2_5_VLForConditionalGeneration")
 class Qwen2VLVisionModel(MmprojModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -3612,6 +3612,9 @@ def set_gguf_parameters(self):
                 if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
                     raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
             self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+        elif model_type in ['eagle_2_5_vl', 'eagle2_vl', 'eagle2_5_vl']:
+            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
+            self.gguf_writer.add_vision_use_silu(True)
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF tranformers code

From 9a2e7b4bdf97755d0492444606b7e394ffd42aa6 Mon Sep 17 00:00:00 2001
From: Yael Logic <y0548591250@gmail.com>
Date: Thu, 6 Nov 2025 16:12:44 +0200
Subject: [PATCH 2/8] tools/mtmd/clip.cpp: Eagle2_VL mmproj fix

Co-authored-by: YaelGitAccount <ya0504124870@gmail.com>
---
 tools/mtmd/clip.cpp | 165 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 155 insertions(+), 10 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 99775cb3e351c..c65a6bab9aafa 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -12,6 +12,7 @@
 
 #include <cassert>
 #include <cmath>
+#include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
@@ -185,6 +186,11 @@ struct clip_hparams {
 
     patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT;
 
+    int32_t patch_merge_factor = 1;      
+    std::string patch_merge_mode = "flat";  
+    int32_t grid_h = 0;      
+    int32_t grid_w = 0; 
+
     float eps = 1e-6;
     float rope_theta = 0.0;
 
@@ -667,9 +673,57 @@ struct clip_graph {
 
         // LlavaMultiModalProjector (always using GELU activation)
         {
-            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
-            if (model.mm_1_b) {
-                cur = ggml_add(ctx0, cur, model.mm_1_b);
+            // Eagle2-VL: Apply patch merge before MLP projection if n_merge > 1
+            // Prefer clip.vision.spatial_merge_size; treat n_merge==1 as no-merge
+            if (hparams.n_merge > 1 &&
+                (model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM)) {
+                const int scale_factor = hparams.n_merge;
+                // minimal debug: pre-merge C/T
+                {
+                    int C = (int) cur->ne[0];
+                    int T = (int) cur->ne[1];
+                    printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T);
+                }
+                cur = build_patch_merge_permute(cur, scale_factor);
+                // minimal debug: post-merge C/T
+                {
+                    int C_new = (int) cur->ne[0];
+                    int T_new = (int) cur->ne[1];
+                    printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new);
+                }
+            }
+
+            // Use mm_0_w/mm_0_b if available (Eagle2-VL), otherwise mm_1_w/mm_1_b (standard LLaVA)
+            ggml_tensor * first_w = model.mm_0_w ? model.mm_0_w : model.mm_1_w;
+            ggml_tensor * first_b = model.mm_0_b ? model.mm_0_b : model.mm_1_b;
+
+            // Debug shapes before projection to catch matmul mismatches
+            LOG_INF("%s: eagle2-mlp: cur shape:    [%lld, %lld, %lld]\n", __func__, (long long) cur->ne[0],
+                    (long long) cur->ne[1], (long long) cur->ne[2]);
+            if (first_w) {
+                LOG_INF("%s: eagle2-mlp: first_w:     [%lld, %lld]\n", __func__, (long long) first_w->ne[0],
+                        (long long) first_w->ne[1]);
+            }
+
+            // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens]
+            cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]);
+            // minimal debug: shapes before first matmul in this block
+            if (first_w) {
+                int emb_C  = (int) cur->ne[0];
+                int emb_T  = (int) cur->ne[1];
+                int w0_in  = (int) first_w->ne[0];
+                int w0_out = (int) first_w->ne[1];
+                printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out);
+            }
+            if (first_w && first_w->ne[1] != cur->ne[0]) {
+                LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n",
+                        __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]);
+                cur = ggml_transpose(ctx0, cur);
+                cur = ggml_cont(ctx0, cur);
+            }
+            cur = ggml_mul_mat(ctx0, first_w, cur);
+            if (first_b) {
+                cur = ggml_add(ctx0, cur, first_b);
             }
 
             cur = ggml_gelu(ctx0, cur);
@@ -686,8 +740,8 @@ struct clip_graph {
             // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension
             // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows]
 
-            const int p_y             = n_merge > 0 ? n_patches_y / n_merge : n_patches_y;
-            const int p_x             = n_merge > 0 ? n_patches_x / n_merge : n_patches_x;
+            const int p_y             = n_merge > 1 ? n_patches_y / n_merge : n_patches_y;
+            const int p_x             = n_merge > 1 ? n_patches_x / n_merge : n_patches_x;
             const int p_total         = p_x * p_y;
             const int n_embd_text     = cur->ne[0];
             const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row
@@ -710,7 +764,7 @@ struct clip_graph {
 
     // Qwen2VL and Qwen2.5VL use M-RoPE
     ggml_cgraph * build_qwen2vl() {
-        GGML_ASSERT(model.patch_bias == nullptr);
+        // Eagle2-VL and some variants may have patch bias
         GGML_ASSERT(model.class_embedding == nullptr);
 
         const int batch_size       = 1;
@@ -749,6 +803,12 @@ struct clip_graph {
                 n_embd, n_patches_x * n_patches_y, batch_size);
         }
 
+        // add patch bias if present (Eagle2-VL has patch bias)
+        if (model.patch_bias != nullptr) {
+            inp = ggml_add(ctx0, inp, model.patch_bias);
+            cb(inp, "patch_bias", -1);
+        }
+
         ggml_tensor * inpL           = inp;
         ggml_tensor * window_mask    = nullptr;
         ggml_tensor * window_idx     = nullptr;
@@ -867,10 +927,30 @@ struct clip_graph {
             inpL = build_norm(inpL, model.post_ln_w, model.post_ln_b, norm_t, eps, n_layer);
         }
 
+                // Apply patch merge based on metadata
+        // Preferred: clip.vision.spatial_merge_size -> hparams.n_merge
+        // Fallback: legacy keys clip.vision.patch_merge_factor/mode
+        bool did_spatial_merge = false;
+        if (hparams.n_merge > 1) {
+            // e.g. [1152, 1024] -> [1152 * n_merge^2, 1024 / n_merge^2]
+            inpL              = build_patch_merge_permute(inpL, hparams.n_merge);
+            did_spatial_merge = true;
+        } else if (hparams.patch_merge_factor > 1 &&
+                   (hparams.patch_merge_mode == "concat2x2" || hparams.patch_merge_mode == "concat")) {
+            // legacy fallback (kept for backward compatibility)
+            inpL              = build_patch_merge_permute(inpL, hparams.patch_merge_factor);
+            did_spatial_merge = true;
+        }
+
         // multimodal projection
         ggml_tensor * embeddings = inpL;
-        embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
 
+                // Conditional reshape based on whether patch merge was applied
+        if (!did_spatial_merge) {
+            // Standard Qwen2VL path assumes 2x2 merge semantics without explicit permute
+            embeddings = ggml_reshape_3d(ctx0, embeddings, n_embd * 4, n_pos / 4, batch_size);
+        }
+        
         embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
@@ -1551,12 +1631,57 @@ struct clip_graph {
 
             // llava projector
             if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
-                embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+                 // Eagle2-VL: apply 2x2 patch merge on [C, T] layout directly when n_merge > 1
+                if (hparams.n_merge > 1) {
+                    // ensure contiguous before reshape/permutation in patch merge
+                    embeddings             = ggml_cont(ctx0, embeddings);
+                    const int scale_factor = hparams.n_merge;
+                    // minimal debug: pre-merge C/T (embeddings)
+                    {
+                        int C = (int) embeddings->ne[0];
+                        int T = (int) embeddings->ne[1];
+                        printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T);
+                    }
+                    embeddings = build_patch_merge_permute(embeddings, scale_factor);
+                    // minimal debug: post-merge C/T (embeddings)
+                    {
+                        int C_new = (int) embeddings->ne[0];
+                        int T_new = (int) embeddings->ne[1];
+                        printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new);
+                    }
+                }
+                LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__,
+                        (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0],
+                        (long long) model.mm_0_w->ne[1]);
+                ggml_tensor * w0 = model.mm_0_w;
+                // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose.
+                // minimal debug: shapes right before mm_0 matmul
+                {
+                    int emb_C  = (int) embeddings->ne[0];
+                    int emb_T  = (int) embeddings->ne[1];
+                    int w0_in  = (int) w0->ne[0];
+                    int w0_out = (int) w0->ne[1];
+                    printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out);
+                }
+                if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) {
+                    LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__,
+                            (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0],
+                            (long long) embeddings->ne[1]);
+                    w0 = ggml_cont(ctx0, ggml_transpose(ctx0, w0));
+                }
+                embeddings = ggml_mul_mat(ctx0, w0, embeddings);
                 embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
 
                 embeddings = ggml_gelu(ctx0, embeddings);
                 if (model.mm_2_w) {
-                    embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
+                     ggml_tensor * w2 = model.mm_2_w;
+                    if (w2->ne[0] != embeddings->ne[0] && w2->ne[1] == embeddings->ne[0]) {
+                        LOG_WRN("%s: llava-mlp: transposing mm_2_w for mul_mat: w2[%lld, %lld] emb[%lld, %lld]",
+                                __func__, (long long) w2->ne[0], (long long) w2->ne[1], (long long) embeddings->ne[0],
+                                (long long) embeddings->ne[1]);
+                        w2 = ggml_cont(ctx0, ggml_transpose(ctx0, w2));
+                    }
+                    embeddings = ggml_mul_mat(ctx0, w2, embeddings);
                     embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
                 }
             }
@@ -2706,6 +2831,12 @@ struct clip_model_loader {
                 if (mm_patch_merge_type == "spatial_unpad") {
                     hparams.mm_patch_merge_type = PATCH_MERGE_SPATIAL_UNPAD;
                 }
+
+                 // Load Eagle2-VL specific patch merge metadata
+                get_i32("clip.vision.patch_merge_factor", hparams.patch_merge_factor, false);
+                get_string("clip.vision.patch_merge_mode", hparams.patch_merge_mode, false);
+                get_i32("clip.vision.grid_h", hparams.grid_h, false);
+                get_i32("clip.vision.grid_w", hparams.grid_w, false);
             }
 
             if (is_vision) {
@@ -2735,6 +2866,15 @@ struct clip_model_loader {
 
             // model-specific params
             switch (model.proj_type) {
+                case PROJECTOR_TYPE_MLP:
+                case PROJECTOR_TYPE_MLP_NORM:
+                    {
+                        // Eagle2-VL: Load spatial merge size for patch merge
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        // minimal debug: report n_merge loaded from metadata
+                        printf("[E2VL] n_merge=%d\n", hparams.n_merge);
+                    }
+                    break;
                 case PROJECTOR_TYPE_MINICPMV:
                     {
                         if (hparams.minicpmv_version == 0) {
@@ -4462,7 +4602,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_JANUS_PRO:
             {
-                // do nothing
+                // account for spatial patch merge when present (e.g., Eagle2-VL)
+                // both X and Y are downscaled by the merge factor
+                const int scale_factor = ctx->model.hparams.n_merge;
+                if (scale_factor > 0) {
+                    n_patches /= (scale_factor * scale_factor);
+                }
             } break;
         case PROJECTOR_TYPE_LDP:
         case PROJECTOR_TYPE_LDPV2:

From c5be21579f6797bdb831acfd7cb93b066ff757ce Mon Sep 17 00:00:00 2001
From: Yael Logic <y0548591250@gmail.com>
Date: Thu, 6 Nov 2025 16:24:29 +0200
Subject: [PATCH 3/8] convert_hf_to_gguf.py: Eagle2_VL support update

Co-authored-by: YaelGitAccount <ya0504124870@gmail.com>
---
 convert_hf_to_gguf.py | 95 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 89 insertions(+), 6 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index cfece431dda5e..f988ed3f12540 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3583,9 +3583,11 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         assert self.hparams_vision is not None
         self.hparams_vision["image_size"] = self.hparams_vision.get("image_size", 560)
-        # rename config.json values
-        self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
-        self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
+        # rename config.json values for Qwen models
+        if self.hparams_vision.get("num_heads") is not None:
+            self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_heads")
+        if self.hparams_vision.get("depth") is not None:
+            self.hparams_vision["num_hidden_layers"] = self.hparams_vision.get("depth")
         if "embed_dim" in self.hparams_vision: # qwen2vl
             self.hparams_vision["intermediate_size"] = self.hparams_vision.get("hidden_size")
             self.hparams_vision["hidden_size"] = self.hparams_vision.get("embed_dim")
@@ -3613,8 +3615,43 @@ def set_gguf_parameters(self):
                     raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
             self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
         elif model_type in ['eagle_2_5_vl', 'eagle2_vl', 'eagle2_5_vl']:
-            self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.QWEN25VL)
-            self.gguf_writer.add_vision_use_silu(True)
+            # Eagle2-VL uses MLP projector with 2x2 patch merge
+            # Structure: Vision encoder → 2x2 patch merge → LayerNorm → Linear → GELU → Linear
+            self.gguf_writer.add_clip_projector_type("mlp")
+            
+            # Add spatial_merge_size for patch merge (stored as n_merge in hparams)
+            self.gguf_writer.add_vision_spatial_merge_size(2)
+            
+            # Add grid dimensions for runtime to calculate merge
+            image_size = self.find_vparam(["image_size"])
+            patch_size = self.find_vparam(["patch_size"])
+            grid_h = grid_w = image_size // patch_size
+            self.gguf_writer.add_key_value("clip.vision.grid_h", grid_h, gguf.GGUFValueType.INT32)
+            self.gguf_writer.add_key_value("clip.vision.grid_w", grid_w, gguf.GGUFValueType.INT32)
+            
+            # Eagle2-VL uses window attention similar to Qwen2.5-VL but doesn't have fullatt_block_indexes
+            # Set a reasonable default window attention pattern (every 4th layer uses full attention)
+            n_wa_pattern = 4  # Default value for Eagle2-VL based on similar models
+            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
+            
+            # --- BEGIN: Eagle2 fallback for required vision metadata ---
+            assert self.hparams_vision is not None
+            hv = self.hparams_vision
+            # block_count (num of vision layers) fallback - check original vision_config first
+            blk = hv.get('num_hidden_layers') or hv.get('num_layers') or hv.get('n_layers')
+            if blk is None:
+                # Try to get from original vision_config before any transformations
+                original_vision_config = self.global_config.get('vision_config', {})
+                blk = original_vision_config.get('num_hidden_layers') or original_vision_config.get('num_layers') or original_vision_config.get('n_layers')
+            if blk is None:
+                # As a last resort, try to infer from config layout if present
+                # (keep it simple: raise with a clear message if still missing)
+                raise ValueError("Eagle2: missing vision block count (num_hidden_layers/num_layers/n_layers) in vision_config")
+            self.gguf_writer.add_vision_block_count(int(blk))
+            # (Optional) You can add other explicit fallbacks here only if they also turn out None later:
+            # head_count = hv.get('num_attention_heads', hv.get('num_heads'))
+            # if head_count is not None: self.gguf_writer.add_vision_head_count(int(head_count))
+            # --- END: Eagle2 fallback ---
         else:
             raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
         # default values below are taken from HF tranformers code
@@ -3627,8 +3664,47 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
-        if name.startswith("visual."):
+        if name.startswith("visual.") or name.startswith("vision_model.") or name.startswith("mlp1."):
+            # Skip all vision model head layers - not needed for mmproj
+            if ".head." in name:
+                return []
+                
+            # Handle projector tensors (Eagle2-VL uses mlp1.N.weight/bias pattern)
+            if name.startswith("mlp1."):
+                # Eagle2-VL has: LayerNorm(0) → Linear(1) → GELU(2) → Linear(3)
+                # QWEN2VL projector expects: Linear(0) → GELU → Linear(2)
+                # So we need to remap: mlp1.1 → mm.0, mlp1.3 → mm.2
+                # Skip mlp1.0 (LayerNorm) as it's not used by QWEN2VL projector type
+                if ".0." in name:
+                    # Skip LayerNorm layer
+                    return []
+                elif ".1." in name:
+                    # Map first Linear layer (mlp1.1) to mm.0
+                    # Original: [896, 4608] -> Need to transpose for GGML: [4608, 896]
+                    if ".weight" in name:
+                        new_name = name.replace("mlp1.1.", "mm.0.")
+                        return [(new_name, data_torch.T)]  # Transpose the weight
+                    else:
+                        new_name = name.replace("mlp1.1.", "mm.0.")
+                        return [(new_name, data_torch)]
+                elif ".3." in name:
+                    # Map second Linear layer (mlp1.3) to mm.2
+                    # Original: [896, 896] -> Need to transpose for GGML: [896, 896] (square matrix)
+                    if ".weight" in name:
+                        new_name = name.replace("mlp1.3.", "mm.2.")
+                        return [(new_name, data_torch.T)]  # Transpose the weight
+                    else:
+                        new_name = name.replace("mlp1.3.", "mm.2.")
+                        return [(new_name, data_torch)]
+                else:
+                    # Unknown mlp1 layer
+                    return []
+                
             # process visual tensors
+            # Handle Eagle2-VL specific naming: vision_model.vision_model.* -> model.vision_model.*
+            if name.startswith("vision_model.vision_model."):
+                name = name.replace("vision_model.vision_model.", "model.vision_model.")
+            
             # split QKV tensors if needed
             if ".qkv." in name:
                 if data_torch.ndim == 2: # weight
@@ -3656,6 +3732,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 ]
             else:
                 return [(self.map_tensor_name(name), data_torch)]
+        elif name.startswith("multi_modal_projector."):
+            # Handle projector tensors (for other Qwen2.5-VL models that use multi_modal_projector prefix)
+            # Convert mm.model.mlp.N.weight/bias to mm.N.weight/bias pattern  
+            new_name = name.replace("multi_modal_projector.", "")
+            if "mm.model.mlp." in new_name:
+                new_name = new_name.replace("mm.model.mlp.", "mm.")
+            return [(new_name, data_torch)]
         return [] # skip other tensors
 
 

From 14c56d703e8e260ad2b0c5ebafbc0bafa5d16b98 Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Wed, 12 Nov 2025 11:44:36 +0200
Subject: [PATCH 4/8] mtmd: instrumentation + preprocessing parity helper;
 stats & media marker logs for Eagle2-VL

---
 tools/mtmd/clip.cpp              | 161 ++++++++++++++++++++++++++-----
 tools/mtmd/hf_preprocess_dump.py |  42 ++++++++
 tools/mtmd/mtmd-cli.cpp          |  35 ++++++-
 tools/mtmd/mtmd.cpp              |  40 +++++++-
 4 files changed, 251 insertions(+), 27 deletions(-)
 create mode 100755 tools/mtmd/hf_preprocess_dump.py

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index c65a6bab9aafa..5e522f4bd43d8 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1,3 +1,4 @@
+// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
 // NOTE: This is modified from clip.cpp only for LLaVA,
 // so there might be still unnecessary artifacts hanging around
 // I'll gradually clean and extend it
@@ -1617,15 +1618,10 @@ struct clip_graph {
 
         // llava projector (also used by granite)
         if (ctx->model.hparams.has_llava_projector) {
+            // consume the full post-merge sequence directly; no row selection via patches
             embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
-
-            ggml_tensor * patches = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches);
-            ggml_set_name(patches, "patches");
-            ggml_set_input(patches);
-
-            // shape [1, 576, 1024]
-            // ne is whcn, ne = [1024, 576, 1, 1]
-            embeddings = ggml_get_rows(ctx0, embeddings, patches);
+            // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided)
+            printf("[E2VL] projector: using full sequence (no row gather)\n");
 
             // print_tensor_info(embeddings, "embeddings");
 
@@ -1636,18 +1632,20 @@ struct clip_graph {
                     // ensure contiguous before reshape/permutation in patch merge
                     embeddings             = ggml_cont(ctx0, embeddings);
                     const int scale_factor = hparams.n_merge;
-                    // minimal debug: pre-merge C/T (embeddings)
-                    {
-                        int C = (int) embeddings->ne[0];
-                        int T = (int) embeddings->ne[1];
-                        printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T);
-                    }
+                    int C_before = (int) embeddings->ne[0];
+                    int T_before = (int) embeddings->ne[1];
+                    printf("[E2VL] pre-merge: C=%d, T=%d (scale_factor=%d)\n", C_before, T_before, scale_factor);
                     embeddings = build_patch_merge_permute(embeddings, scale_factor);
-                    // minimal debug: post-merge C/T (embeddings)
-                    {
-                        int C_new = (int) embeddings->ne[0];
-                        int T_new = (int) embeddings->ne[1];
-                        printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new);
+                    int C_after = (int) embeddings->ne[0];
+                    int T_after = (int) embeddings->ne[1];
+                    printf("[E2VL] post-merge: C=%d, T=%d\n", C_after, T_after);
+                    int expected_C = C_before * scale_factor * scale_factor;
+                    int expected_T = T_before / (scale_factor * scale_factor);
+                    if (C_after != expected_C || T_after != expected_T) {
+                        printf("[E2VL] WARN: unexpected post-merge shape (possible double-merge?) got C=%d (exp %d) T=%d (exp %d)\n",
+                               C_after, expected_C, T_after, expected_T);
+                    } else {
+                        printf("[E2VL] merge check: single merge confirmed (C scales by %d^2, T divides by %d^2)\n", scale_factor, scale_factor);
                     }
                 }
                 LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__,
@@ -1683,6 +1681,8 @@ struct clip_graph {
                     }
                     embeddings = ggml_mul_mat(ctx0, w2, embeddings);
                     embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+                    // tag for post-compute stats collection
+                    ggml_set_name(embeddings, "e2vl_proj_out");
                 }
             }
             else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
@@ -3727,12 +3727,70 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
 static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
     dst.nx = src.nx;
     dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
+    // Per-channel normalization: dst = (src / 255 - mean[c]) / std[c].
+    // NOTE(review): the removed loop stored dst.buf interleaved (same index as src.buf); this version
+    // stores planar RGB (plane c starts at c * plane_sz) - confirm that every consumer expects planar.
+    const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny;
+    dst.buf.resize(3 * plane_sz); // planar RGB
+
+    const bool stats_enabled = std::getenv("E2VL_STATS") != nullptr; // debug stats are opt-in via env
+    double ch_sum[3] = {0.0,0.0,0.0};
+    double ch_min[3] = {1e9,1e9,1e9};
+    double ch_max[3] = {-1e9,-1e9,-1e9};
+    std::vector<float> ch_first8[3]; // first 8 normalized values of each channel
+    ch_first8[0].reserve(8); ch_first8[1].reserve(8); ch_first8[2].reserve(8);
+
+    for (int y = 0; y < dst.ny; ++y) {
+        for (int x = 0; x < dst.nx; ++x) {
+            const size_t base = (size_t) y * (size_t) dst.nx + (size_t) x;
+            for (int c = 0; c < 3; ++c) {
+                const size_t src_idx = 3ull * base + (size_t) c; // interleaved in src
+                const float raw = static_cast<float>(src.buf[src_idx]) / 255.0f;
+                const float v = (raw - mean[c]) / std[c];
+                const size_t dst_idx = (size_t) c * plane_sz + base; // planar in dst
+                dst.buf[dst_idx] = v;
+                if (stats_enabled) {
+                    ch_sum[c] += v;
+                    ch_min[c] = std::min<double>(ch_min[c], v);
+                    ch_max[c] = std::max<double>(ch_max[c], v);
+                    if (ch_first8[c].size() < 8) ch_first8[c].push_back(v);
+                }
+            }
+        }
+    }
 
-    // TODO @ngxson : seems like this could be done more efficiently on cgraph
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        int c = i % 3; // rgb
-        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
+    if (stats_enabled && plane_sz > 0) { // an empty image has no mean (0 / 0) and no min/max
+        const double denom = (double) plane_sz; // size_t product: cannot overflow like int nx * ny
+        double ch_mean[3] = { ch_sum[0] / denom, ch_sum[1] / denom, ch_sum[2] / denom };
+        printf("[E2VL] preprocess stats (RGB channel order)\n");
+        for (int c = 0; c < 3; ++c) {
+            printf("[E2VL] channel %d first8: ", c);
+            for (float v : ch_first8[c]) printf(" % .6f ", v);
+            printf("\n");
+            printf("[E2VL] channel %d min=% .6f max=% .6f mean=% .6f\n", c, ch_min[c], ch_max[c], ch_mean[c]);
+        }
+    }
+
+    // Optional raw dump of the normalized buffer, enabled by E2VL_PRE_DUMP. The output path is read
+    // from E2VL_PRE_CPP_OUT and falls back to "e2vl_pre_cpp.bin".
+    if (std::getenv("E2VL_PRE_DUMP") != nullptr) {
+        const char * path = std::getenv("E2VL_PRE_CPP_OUT");
+        if (!path) path = "e2vl_pre_cpp.bin";
+        // dst.buf is already planar here, so the dump is an element-for-element float copy of it
+        const std::vector<float> planar(dst.buf.begin(), dst.buf.end());
+        FILE * f = fopen(path, "wb");
+        if (f) {
+            const size_t n_written = fwrite(planar.data(), sizeof(float), planar.size(), f);
+            fclose(f);
+            if (n_written != planar.size()) {
+                // a truncated dump would silently corrupt parity checks: report it even without E2VL_STATS
+                printf("[E2VL] WARN: short write to preprocess dump path %s (%zu of %zu floats)\n", path, n_written, planar.size());
+            } else if (stats_enabled) {
+                printf("[E2VL] preprocess dump written (planar RGB) path=%s size=%zu floats\n", path, (size_t)planar.size());
+            }
+        } else { // the dump was explicitly requested, so an unopenable path is always reported
+            printf("[E2VL] WARN: failed to open preprocess dump path %s\n", path);
+        }
     }
 }
 
@@ -5107,7 +5165,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 for (int i = 0; i < num_patches; i++) {
                     patches[i] = i + patch_offset;
                 }
-                set_input_i32("patches", patches);
+                // Make patches optional: if the graph doesn't contain an input named "patches"
+                // (Eagle2-VL full-sequence path), skip without aborting.
+                ggml_tensor * patches_tensor = ggml_graph_get_tensor(gf, "patches");
+                if (patches_tensor && (patches_tensor->flags & GGML_TENSOR_FLAG_INPUT)) {
+                    GGML_ASSERT(patches_tensor->type == GGML_TYPE_I32);
+                    GGML_ASSERT(ggml_nelements(patches_tensor) == (int64_t)patches.size());
+                    ggml_backend_tensor_set(patches_tensor, patches.data(), 0, ggml_nbytes(patches_tensor));
+                } else {
+                    // Only log in verbose contexts (llava projector present) to avoid spam for other models.
+                    if (ctx->model.hparams.has_llava_projector) {
+                        printf("[E2VL] no 'patches' tensor in graph (full-sequence path)\n");
+                    }
+                }
             } break;
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
@@ -5158,6 +5228,47 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
+    // E2VL projector output stats/dump (post-compute)
+    if (std::getenv("E2VL_STATS") != nullptr || std::getenv("E2VL_DUMP") != nullptr) {
+        ggml_tensor * proj = ggml_graph_get_tensor(gf, "e2vl_proj_out");
+        if (proj != nullptr) {
+            const int64_t C = proj->ne[0];
+            const int64_t T = proj->ne[1];
+            const int64_t N = C * T;
+            std::vector<float> buf((size_t) N);
+            ggml_backend_tensor_get(proj, buf.data(), 0, ggml_nbytes(proj));
+
+            long double sum = 0.0L, sq = 0.0L;
+            for (int64_t i = 0; i < N; ++i) {
+                const long double v = buf[(size_t) i];
+                sum += v;
+                sq  += v * v;
+            }
+            const long double mean = sum / (long double) N;
+            const long double var  = std::max<long double>(0.0L, sq / (long double) N - mean * mean);
+            const long double stdv = sqrt((double) var);
+            const long double l2n  = sqrt((double) sq);
+            if (std::getenv("E2VL_STATS") != nullptr) {
+                printf("[E2VL] projector out stats (after mm.2): shape=[%lld,%lld] mean=% .6Lf std=% .6Lf L2=% .6Lf\n",
+                       (long long) C, (long long) T, mean, stdv, l2n);
+            }
+            if (std::getenv("E2VL_DUMP") != nullptr) {
+                const char * path = std::getenv("E2VL_CPP_OUT");
+                if (!path) path = "e2vl_projector_cpp.bin";
+                FILE * f = fopen(path, "wb");
+                if (f) {
+                    fwrite(buf.data(), sizeof(float), (size_t) N, f);
+                    fclose(f);
+                    printf("[E2VL] projector output dumped to %s (N=%lld)\n", path, (long long) N);
+                } else {
+                    printf("[E2VL] WARN: failed to open dump path %s\n", path);
+                }
+            }
+        } else {
+            // Silent if not present to avoid noise on non-E2VL models
+        }
+    }
+
     // print debug nodes
     if (ctx->debug_graph) {
         LOG_INF("\n\n---\n\n");
diff --git a/tools/mtmd/hf_preprocess_dump.py b/tools/mtmd/hf_preprocess_dump.py
new file mode 100755
index 0000000000000..a70efe81cc233
--- /dev/null
+++ b/tools/mtmd/hf_preprocess_dump.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""
+hf_preprocess_dump.py
+Create a planar C,H,W float32 dump matching the C++ E2VL preprocessing (resize->center crop->normalize SigLIP mean=0.5 std=0.5).
+Usage: python3 hf_preprocess_dump.py input.jpg out.bin
+"""
+import sys
+from PIL import Image
+import numpy as np
+
+if len(sys.argv) < 3:
+    print("usage: hf_preprocess_dump.py INPUT_IMAGE OUT_BIN")
+    sys.exit(2)
+
+inp = sys.argv[1]
+out = sys.argv[2]
+
+image_size = 448
+mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
+std = np.array([0.5, 0.5, 0.5], dtype=np.float32)
+
+img = Image.open(inp).convert('RGB')
+# Resize short side to image_size, keep aspect, then center crop
+w, h = img.size
+if w < h:
+    new_w = image_size
+    new_h = int(round(h * (image_size / w)))
+else:
+    new_h = image_size
+    new_w = int(round(w * (image_size / h)))
+img = img.resize((new_w, new_h), resample=Image.BICUBIC)
+# center crop
+left = (new_w - image_size) // 2
+top = (new_h - image_size) // 2
+img = img.crop((left, top, left + image_size, top + image_size))
+arr = np.array(img).astype(np.float32) / 255.0
+# Normalize (SigLIP style): (x - mean)/std
+arr = (arr - mean) / std
+# Convert to planar C,H,W
+planar = np.transpose(arr, (2,0,1)).astype(np.float32)
+planar.tofile(out)
+print(f"wrote {out}: shape={planar.shape}, bytes={planar.nbytes}")
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 3e19e95958a2f..f7134d8a3f93d 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -1,3 +1,4 @@
+// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
 #include "arg.h"
 #include "log.h"
 #include "common.h"
@@ -179,6 +180,34 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
         generated_tokens.push_back(token_id);
         common_sampler_accept(ctx.smpl, token_id, true);
 
+        if (i == 0 && std::getenv("E2VL_STATS") != nullptr) {
+            // Dump top-10 logits used to sample the first generated token.
+            // Prefer llama_get_logits() over llama_get_logits_ith() since the latter may not be populated
+            // when using the helper chunk evaluation path.
+            const struct llama_vocab * v = llama_model_get_vocab(ctx.model);
+            const int n_vocab = llama_vocab_n_tokens(v);
+            const float * logits = llama_get_logits(ctx.lctx);
+            if (!logits) {
+                // fall back to ith accessor
+                logits = llama_get_logits_ith(ctx.lctx, 0);
+            }
+            if (logits) {
+                struct Item { int id; float logit; }; std::vector<Item> items; items.reserve(n_vocab);
+                for (int t = 0; t < n_vocab; ++t) items.push_back({t, logits[t]});
+                std::partial_sort(items.begin(), items.begin()+std::min<size_t>(10, items.size()), items.end(), [](const Item & a, const Item & b){return a.logit > b.logit;});
+                printf("[E2VL] first-token top10 logits:\n");
+                float denom = 0.0f; for (size_t j = 0; j < 10 && j < items.size(); ++j) denom += expf(items[j].logit - items[0].logit);
+                for (size_t k = 0; k < 10 && k < items.size(); ++k) {
+                    auto & it = items[k];
+                    std::string piece = common_token_to_piece(ctx.lctx, it.id);
+                    float prob = expf(it.logit - items[0].logit) / (denom > 0 ? denom : 1);
+                    printf("  id=%d piece='%s' logit=% .5f approx_prob=% .5f\n", it.id, piece.c_str(), it.logit, prob);
+                }
+            } else {
+                printf("[E2VL] WARN: logits unavailable for first-token probe (no logits pointer)\n");
+            }
+        }
+
         if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
             LOG("\n");
             break; // end of generation
@@ -311,7 +340,11 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
+        // TEMP: Guard against duplicate vision markers (safe for now, to be removed after EAGLE2_VL stabilization)
+        // Avoid auto-appending a media marker if the prompt already contains an IMG_CONTEXT placeholder
+        const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos;
+        const bool has_img_context_placeholder = params.prompt.find("<IMG_CONTEXT>") != std::string::npos;
+        if (!has_default_media_marker && !has_img_context_placeholder) {
             for (size_t i = 0; i < params.image.size(); i++) {
                 params.prompt += mtmd_default_marker();
             }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 325f7ff995e36..ac499e0ff0403 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1,3 +1,4 @@
+// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
 #include "clip.h"
 #include "clip-impl.h"
 #include "mtmd.h"
@@ -765,7 +766,44 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             const mtmd_bitmap ** bitmaps,
             size_t n_bitmaps) {
     mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
-    return tokenizer.tokenize(output);
+    int32_t ret = tokenizer.tokenize(output);
+    if (ret == 0 && std::getenv("E2VL_STATS") != nullptr) {
+        // Inspect chunks to verify IMG_CONTEXT replacement: we should NOT see the literal media marker any more,
+        // instead we have an IMAGE chunk with projected embeddings.
+        size_t n_chunks = mtmd_input_chunks_size(output);
+        for (size_t i = 0; i < n_chunks; ++i) {
+            auto * chunk = mtmd_input_chunks_get(output, i);
+            if (!chunk) continue;
+            if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
+                // find left and right neighboring text token ids for context
+                llama_token left_id = -1;
+                llama_token right_id = -1;
+                // search left
+                for (int64_t j = (int64_t)i - 1; j >= 0; --j) {
+                    auto * c2 = mtmd_input_chunks_get(output, j);
+                    if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                        size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt);
+                        if (ntt > 0) { left_id = toks[ntt - 1]; }
+                        break;
+                    }
+                }
+                // search right
+                for (size_t j = i + 1; j < n_chunks; ++j) {
+                    auto * c2 = mtmd_input_chunks_get(output, j);
+                    if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
+                        size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt);
+                        if (ntt > 0) { right_id = toks[0]; }
+                        break;
+                    }
+                }
+                const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk);
+                uint32_t inserted_tokens = img ? img->n_tokens() : 0;
+                printf("[E2VL] IMG_CONTEXT replaced: left_id=%d right_id=%d inserted_image_tokens=%u\n",
+                       (int)left_id, (int)right_id, inserted_tokens);
+            }
+        }
+    }
+    return ret;
 }
 
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {

From 7bc9c9ec60975e2f618c81025494de5ef01f786c Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Wed, 12 Nov 2025 12:12:00 +0200
Subject: [PATCH 5/8] mtmd: remove unused debug-only variables and no-op casts

---
 tools/mtmd/clip.cpp | 148 +++-----------------------------------------
 1 file changed, 8 insertions(+), 140 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 5e522f4bd43d8..0f64dc3fe139a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1,8 +1,4 @@
-// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
-// NOTE: This is modified from clip.cpp only for LLaVA,
-// so there might be still unnecessary artifacts hanging around
-// I'll gradually clean and extend it
-// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
+#include <cstddef>
 #include "clip.h"
 #include "clip-impl.h"
 #include "ggml.h"
@@ -679,19 +675,7 @@ struct clip_graph {
             if (hparams.n_merge > 1 &&
                 (model.proj_type == PROJECTOR_TYPE_MLP || model.proj_type == PROJECTOR_TYPE_MLP_NORM)) {
                 const int scale_factor = hparams.n_merge;
-                // minimal debug: pre-merge C/T
-                {
-                    int C = (int) cur->ne[0];
-                    int T = (int) cur->ne[1];
-                    printf("[E2VL] pre-merge: C=%d, T=%d\n", C, T);
-                }
                 cur = build_patch_merge_permute(cur, scale_factor);
-                // minimal debug: post-merge C/T
-                {
-                    int C_new = (int) cur->ne[0];
-                    int T_new = (int) cur->ne[1];
-                    printf("[E2VL] post-merge: C=%d, T=%d\n", C_new, T_new);
-                }
             }
 
             // Use mm_0_w/mm_0_b if available (Eagle2-VL), otherwise mm_1_w/mm_1_b (standard LLaVA)
@@ -708,14 +692,6 @@ struct clip_graph {
 
             // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens]
             cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]);
-            // minimal debug: shapes before first matmul in this block
-            if (first_w) {
-                int emb_C  = (int) cur->ne[0];
-                int emb_T  = (int) cur->ne[1];
-                int w0_in  = (int) first_w->ne[0];
-                int w0_out = (int) first_w->ne[1];
-                printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out);
-            }
             if (first_w && first_w->ne[1] != cur->ne[0]) {
                 LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n",
                         __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]);
@@ -1621,7 +1597,7 @@ struct clip_graph {
             // consume the full post-merge sequence directly; no row selection via patches
             embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
             // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided)
-            printf("[E2VL] projector: using full sequence (no row gather)\n");
+            // using full sequence (no row gather)
 
             // print_tensor_info(embeddings, "embeddings");
 
@@ -1632,35 +1608,14 @@ struct clip_graph {
                     // ensure contiguous before reshape/permutation in patch merge
                     embeddings             = ggml_cont(ctx0, embeddings);
                     const int scale_factor = hparams.n_merge;
-                    int C_before = (int) embeddings->ne[0];
-                    int T_before = (int) embeddings->ne[1];
-                    printf("[E2VL] pre-merge: C=%d, T=%d (scale_factor=%d)\n", C_before, T_before, scale_factor);
                     embeddings = build_patch_merge_permute(embeddings, scale_factor);
-                    int C_after = (int) embeddings->ne[0];
-                    int T_after = (int) embeddings->ne[1];
-                    printf("[E2VL] post-merge: C=%d, T=%d\n", C_after, T_after);
-                    int expected_C = C_before * scale_factor * scale_factor;
-                    int expected_T = T_before / (scale_factor * scale_factor);
-                    if (C_after != expected_C || T_after != expected_T) {
-                        printf("[E2VL] WARN: unexpected post-merge shape (possible double-merge?) got C=%d (exp %d) T=%d (exp %d)\n",
-                               C_after, expected_C, T_after, expected_T);
-                    } else {
-                        printf("[E2VL] merge check: single merge confirmed (C scales by %d^2, T divides by %d^2)\n", scale_factor, scale_factor);
-                    }
                 }
                 LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__,
                         (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0],
                         (long long) model.mm_0_w->ne[1]);
                 ggml_tensor * w0 = model.mm_0_w;
                 // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose.
-                // minimal debug: shapes right before mm_0 matmul
-                {
-                    int emb_C  = (int) embeddings->ne[0];
-                    int emb_T  = (int) embeddings->ne[1];
-                    int w0_in  = (int) w0->ne[0];
-                    int w0_out = (int) w0->ne[1];
-                    printf("[E2VL] emb=[%d,%d], w0=[%d,%d]\n", emb_C, emb_T, w0_in, w0_out);
-                }
+                // shapes validated at runtime by checks below
                 if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) {
                     LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__,
                             (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0],
@@ -1681,8 +1636,6 @@ struct clip_graph {
                     }
                     embeddings = ggml_mul_mat(ctx0, w2, embeddings);
                     embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
-                    // tag for post-compute stats collection
-                    ggml_set_name(embeddings, "e2vl_proj_out");
                 }
             }
             else if (ctx->proj_type() == PROJECTOR_TYPE_MLP_NORM) {
@@ -2872,7 +2825,7 @@ struct clip_model_loader {
                         // Eagle2-VL: Load spatial merge size for patch merge
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         // minimal debug: report n_merge loaded from metadata
-                        printf("[E2VL] n_merge=%d\n", hparams.n_merge);
+                        (void)hparams.n_merge;
                     }
                     break;
                 case PROJECTOR_TYPE_MINICPMV:
@@ -3730,12 +3683,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
     const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny;
     dst.buf.resize(3 * plane_sz); // planar RGB
 
-    bool stats_enabled = std::getenv("E2VL_STATS") != nullptr;
-    double ch_sum[3] = {0.0,0.0,0.0};
-    double ch_min[3] = {1e9,1e9,1e9};
-    double ch_max[3] = {-1e9,-1e9,-1e9};
-    std::vector<float> ch_first8[3];
-    ch_first8[0].reserve(8); ch_first8[1].reserve(8); ch_first8[2].reserve(8);
+    // removed E2VL_STATS debug instrumentation
 
     for (int y = 0; y < dst.ny; ++y) {
         for (int x = 0; x < dst.nx; ++x) {
@@ -3746,52 +3694,11 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
                 float v = (raw - mean[c]) / std[c];
                 size_t dst_idx = (size_t) c * plane_sz + base; // planar in dst
                 dst.buf[dst_idx] = v;
-                if (stats_enabled) {
-                    ch_sum[c] += v;
-                    ch_min[c] = std::min<double>(ch_min[c], v);
-                    ch_max[c] = std::max<double>(ch_max[c], v);
-                    if ((int)ch_first8[c].size() < 8) ch_first8[c].push_back(v);
-                }
+                
             }
         }
     }
 
-    if (stats_enabled) {
-        const double denom = double(dst.nx * dst.ny);
-        double ch_mean[3] = { ch_sum[0] / denom, ch_sum[1] / denom, ch_sum[2] / denom };
-        printf("[E2VL] preprocess stats (RGB channel order)\n");
-        for (int c = 0; c < 3; ++c) {
-            printf("[E2VL] channel %d first8: ", c);
-            for (float v : ch_first8[c]) printf(" % .6f ", v);
-            printf("\n");
-            printf("[E2VL] channel %d min=% .6f max=% .6f mean=% .6f\n", c, ch_min[c], ch_max[c], ch_mean[c]);
-        }
-    }
-
-    if (std::getenv("E2VL_PRE_DUMP") != nullptr) {
-        const char * path = std::getenv("E2VL_PRE_CPP_OUT");
-        if (!path) path = "e2vl_pre_cpp.bin";
-        std::vector<float> planar(dst.buf.size());
-        for (int c = 0; c < 3; ++c) {
-            for (int y = 0; y < dst.ny; ++y) {
-                for (int x = 0; x < dst.nx; ++x) {
-                    size_t src_idx = (size_t) c * plane_sz + (size_t) y * dst.nx + (size_t) x;
-                    size_t dst_idx = src_idx;
-                    planar[dst_idx] = dst.buf[src_idx];
-                }
-            }
-        }
-        FILE * f = fopen(path, "wb");
-        if (f) {
-            fwrite(planar.data(), sizeof(float), planar.size(), f);
-            fclose(f);
-            if (stats_enabled) {
-                printf("[E2VL] preprocess dump written (planar RGB) path=%s size=%zu floats\n", path, (size_t)planar.size());
-            }
-        } else if (stats_enabled) {
-            printf("[E2VL] WARN: failed to open preprocess dump path %s\n", path);
-        }
-    }
 }
 
 // set of tools to manupulate images
@@ -5175,7 +5082,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 } else {
                     // Only log in verbose contexts (llava projector present) to avoid spam for other models.
                     if (ctx->model.hparams.has_llava_projector) {
-                        printf("[E2VL] no 'patches' tensor in graph (full-sequence path)\n");
+                        // no 'patches' tensor in graph (full-sequence path)
                     }
                 }
             } break;
@@ -5228,46 +5135,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
-    // E2VL projector output stats/dump (post-compute)
-    if (std::getenv("E2VL_STATS") != nullptr || std::getenv("E2VL_DUMP") != nullptr) {
-        ggml_tensor * proj = ggml_graph_get_tensor(gf, "e2vl_proj_out");
-        if (proj != nullptr) {
-            const int64_t C = proj->ne[0];
-            const int64_t T = proj->ne[1];
-            const int64_t N = C * T;
-            std::vector<float> buf((size_t) N);
-            ggml_backend_tensor_get(proj, buf.data(), 0, ggml_nbytes(proj));
-
-            long double sum = 0.0L, sq = 0.0L;
-            for (int64_t i = 0; i < N; ++i) {
-                const long double v = buf[(size_t) i];
-                sum += v;
-                sq  += v * v;
-            }
-            const long double mean = sum / (long double) N;
-            const long double var  = std::max<long double>(0.0L, sq / (long double) N - mean * mean);
-            const long double stdv = sqrt((double) var);
-            const long double l2n  = sqrt((double) sq);
-            if (std::getenv("E2VL_STATS") != nullptr) {
-                printf("[E2VL] projector out stats (after mm.2): shape=[%lld,%lld] mean=% .6Lf std=% .6Lf L2=% .6Lf\n",
-                       (long long) C, (long long) T, mean, stdv, l2n);
-            }
-            if (std::getenv("E2VL_DUMP") != nullptr) {
-                const char * path = std::getenv("E2VL_CPP_OUT");
-                if (!path) path = "e2vl_projector_cpp.bin";
-                FILE * f = fopen(path, "wb");
-                if (f) {
-                    fwrite(buf.data(), sizeof(float), (size_t) N, f);
-                    fclose(f);
-                    printf("[E2VL] projector output dumped to %s (N=%lld)\n", path, (long long) N);
-                } else {
-                    printf("[E2VL] WARN: failed to open dump path %s\n", path);
-                }
-            }
-        } else {
-            // Silent if not present to avoid noise on non-E2VL models
-        }
-    }
+    // removed E2VL projector stats/dump block
 
     // print debug nodes
     if (ctx->debug_graph) {

From d6cb43b638a25bbe5fba6b0acd775c9fcb406133 Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Wed, 12 Nov 2025 16:02:15 +0200
Subject: [PATCH 6/8] =?UTF-8?q?feat(mtmd):=20integrate=20Eagle2-VL=20mmpro?=
 =?UTF-8?q?j=20=E2=80=94=20media=20ingest,=20projector=20routing,=20and=20?=
 =?UTF-8?q?shape=20validation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 tools/mtmd/clip.cpp     | 39 ++++++++----------------------------
 tools/mtmd/mtmd-cli.cpp | 38 +++--------------------------------
 tools/mtmd/mtmd.cpp     | 44 -----------------------------------------
 3 files changed, 11 insertions(+), 110 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 0f64dc3fe139a..f14eafa5589be 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -682,19 +682,10 @@ struct clip_graph {
             ggml_tensor * first_w = model.mm_0_w ? model.mm_0_w : model.mm_1_w;
             ggml_tensor * first_b = model.mm_0_b ? model.mm_0_b : model.mm_1_b;
 
-            // Debug shapes before projection to catch matmul mismatches
-            LOG_INF("%s: eagle2-mlp: cur shape:    [%lld, %lld, %lld]\n", __func__, (long long) cur->ne[0],
-                    (long long) cur->ne[1], (long long) cur->ne[2]);
-            if (first_w) {
-                LOG_INF("%s: eagle2-mlp: first_w:     [%lld, %lld]\n", __func__, (long long) first_w->ne[0],
-                        (long long) first_w->ne[1]);
-            }
-
             // Ensure 2D and correct orientation for matmul: first_w[out,in] x cur[in, tokens]
             cur = ggml_reshape_2d(ctx0, cur, cur->ne[0], cur->ne[1]);
             if (first_w && first_w->ne[1] != cur->ne[0]) {
-                LOG_WRN("%s: eagle2-mlp: dim mismatch, transposing cur: first_w[in]=%lld, cur[0]=%lld, cur[1]=%lld\n",
-                        __func__, (long long) first_w->ne[1], (long long) cur->ne[0], (long long) cur->ne[1]);
+                // transpose to match expected [in, tokens]
                 cur = ggml_transpose(ctx0, cur);
                 cur = ggml_cont(ctx0, cur);
             }
@@ -1596,30 +1587,20 @@ struct clip_graph {
         if (ctx->model.hparams.has_llava_projector) {
             // consume the full post-merge sequence directly; no row selection via patches
             embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
-            // Eagle2-VL patch: explicitly log that we are NOT performing any row gather (uninitialized indices avoided)
-            // using full sequence (no row gather)
-
-            // print_tensor_info(embeddings, "embeddings");
+            
 
             // llava projector
             if (ctx->proj_type() == PROJECTOR_TYPE_MLP) {
-                 // Eagle2-VL: apply 2x2 patch merge on [C, T] layout directly when n_merge > 1
+                 // apply 2x2 patch merge on [C, T] layout directly when n_merge > 1
                 if (hparams.n_merge > 1) {
                     // ensure contiguous before reshape/permutation in patch merge
                     embeddings             = ggml_cont(ctx0, embeddings);
                     const int scale_factor = hparams.n_merge;
                     embeddings = build_patch_merge_permute(embeddings, scale_factor);
                 }
-                LOG_INF("%s: llava-mlp before mm_0: emb[%lld, %lld], w0[%lld, %lld]\n", __func__,
-                        (long long) embeddings->ne[0], (long long) embeddings->ne[1], (long long) model.mm_0_w->ne[0],
-                        (long long) model.mm_0_w->ne[1]);
                 ggml_tensor * w0 = model.mm_0_w;
-                // ggml expects w->ne[0] (in_dim) == emb->ne[0]. If loader stored [out,in], fix with transpose.
-                // shapes validated at runtime by checks below
+                // ensure projector weight orientation matches embeddings
                 if (w0->ne[0] != embeddings->ne[0] && w0->ne[1] == embeddings->ne[0]) {
-                    LOG_WRN("%s: llava-mlp: transposing mm_0_w for mul_mat: w0[%lld, %lld] emb[%lld, %lld]", __func__,
-                            (long long) w0->ne[0], (long long) w0->ne[1], (long long) embeddings->ne[0],
-                            (long long) embeddings->ne[1]);
                     w0 = ggml_cont(ctx0, ggml_transpose(ctx0, w0));
                 }
                 embeddings = ggml_mul_mat(ctx0, w0, embeddings);
@@ -1627,11 +1608,8 @@ struct clip_graph {
 
                 embeddings = ggml_gelu(ctx0, embeddings);
                 if (model.mm_2_w) {
-                     ggml_tensor * w2 = model.mm_2_w;
+                    ggml_tensor * w2 = model.mm_2_w;
                     if (w2->ne[0] != embeddings->ne[0] && w2->ne[1] == embeddings->ne[0]) {
-                        LOG_WRN("%s: llava-mlp: transposing mm_2_w for mul_mat: w2[%lld, %lld] emb[%lld, %lld]",
-                                __func__, (long long) w2->ne[0], (long long) w2->ne[1], (long long) embeddings->ne[0],
-                                (long long) embeddings->ne[1]);
                         w2 = ggml_cont(ctx0, ggml_transpose(ctx0, w2));
                     }
                     embeddings = ggml_mul_mat(ctx0, w2, embeddings);
@@ -2824,8 +2802,7 @@ struct clip_model_loader {
                     {
                         // Eagle2-VL: Load spatial merge size for patch merge
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
-                        // minimal debug: report n_merge loaded from metadata
-                        (void)hparams.n_merge;
+                        (void)hparams.n_merge; // keep variable referenced even if unused
                     }
                     break;
                 case PROJECTOR_TYPE_MINICPMV:
@@ -3683,7 +3660,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
     const size_t plane_sz = (size_t) dst.nx * (size_t) dst.ny;
     dst.buf.resize(3 * plane_sz); // planar RGB
 
-    // removed E2VL_STATS debug instrumentation
+    
 
     for (int y = 0; y < dst.ny; ++y) {
         for (int x = 0; x < dst.nx; ++x) {
@@ -5135,7 +5112,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
-    // removed E2VL projector stats/dump block
+    
 
     // print debug nodes
     if (ctx->debug_graph) {
diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index f7134d8a3f93d..d0c9c961cc156 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -1,4 +1,3 @@
-// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
 #include "arg.h"
 #include "log.h"
 #include "common.h"
@@ -180,33 +179,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
         generated_tokens.push_back(token_id);
         common_sampler_accept(ctx.smpl, token_id, true);
 
-        if (i == 0 && std::getenv("E2VL_STATS") != nullptr) {
-            // Dump top-10 logits used to sample the first generated token.
-            // Prefer llama_get_logits() over llama_get_logits_ith() since the latter may not be populated
-            // when using the helper chunk evaluation path.
-            const struct llama_vocab * v = llama_model_get_vocab(ctx.model);
-            const int n_vocab = llama_vocab_n_tokens(v);
-            const float * logits = llama_get_logits(ctx.lctx);
-            if (!logits) {
-                // fall back to ith accessor
-                logits = llama_get_logits_ith(ctx.lctx, 0);
-            }
-            if (logits) {
-                struct Item { int id; float logit; }; std::vector<Item> items; items.reserve(n_vocab);
-                for (int t = 0; t < n_vocab; ++t) items.push_back({t, logits[t]});
-                std::partial_sort(items.begin(), items.begin()+std::min<size_t>(10, items.size()), items.end(), [](const Item & a, const Item & b){return a.logit > b.logit;});
-                printf("[E2VL] first-token top10 logits:\n");
-                float denom = 0.0f; for (size_t j = 0; j < 10 && j < items.size(); ++j) denom += expf(items[j].logit - items[0].logit);
-                for (size_t k = 0; k < 10 && k < items.size(); ++k) {
-                    auto & it = items[k];
-                    std::string piece = common_token_to_piece(ctx.lctx, it.id);
-                    float prob = expf(it.logit - items[0].logit) / (denom > 0 ? denom : 1);
-                    printf("  id=%d piece='%s' logit=% .5f approx_prob=% .5f\n", it.id, piece.c_str(), it.logit, prob);
-                }
-            } else {
-                printf("[E2VL] WARN: logits unavailable for first-token probe (no logits pointer)\n");
-            }
-        }
+        
 
         if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
             LOG("\n");
@@ -240,8 +213,7 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
 }
 
 static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
-    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
-        new_msg.role.c_str(), new_msg.content.c_str());
+    // format and append message
     auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
         new_msg, new_msg.role == "user",
         ctx.use_jinja);
@@ -252,7 +224,6 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
     auto formatted_chat = chat_add_and_format(ctx, msg);
-    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
     text.text          = formatted_chat.c_str();
@@ -340,11 +311,8 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        // TEMP: Guard against duplicate vision markers (safe for now, to be removed after EAGLE2_VL stabilization)
-        // Avoid auto-appending a media marker if the prompt already contains an IMG_CONTEXT placeholder
         const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos;
-        const bool has_img_context_placeholder = params.prompt.find("<IMG_CONTEXT>") != std::string::npos;
-        if (!has_default_media_marker && !has_img_context_placeholder) {
+        if (!has_default_media_marker) {
             for (size_t i = 0; i < params.image.size(); i++) {
                 params.prompt += mtmd_default_marker();
             }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index ac499e0ff0403..e7ed5593aa690 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1,4 +1,3 @@
-// TODO(E2VL_CLEANUP): Remove debug instrumentation and env-flag diagnostics before upstream submission.
 #include "clip.h"
 #include "clip-impl.h"
 #include "mtmd.h"
@@ -480,7 +479,6 @@ struct mtmd_tokenizer {
     }
 
     void add_text(const std::string & txt, bool parse_special) {
-        LOG_DBG("%s: %s\n", __func__, txt.c_str());
         auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
         add_text(tokens);
     }
@@ -614,10 +612,6 @@ struct mtmd_tokenizer {
                 image_tokens->batch_f32 = std::move(batch_f32);
                 image_tokens->id = bitmap->id; // optional
 
-                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
-                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
-                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
-
                 mtmd_input_chunk chunk{
                     MTMD_INPUT_CHUNK_TYPE_IMAGE,
                     {}, // text tokens
@@ -677,8 +671,6 @@ struct mtmd_tokenizer {
                 audio_tokens->batch_f32 = std::move(batch_f32);
                 audio_tokens->id = bitmap->id; // optional
 
-                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
-
                 mtmd_input_chunk chunk{
                     MTMD_INPUT_CHUNK_TYPE_AUDIO,
                     {}, // text tokens
@@ -767,42 +759,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             size_t n_bitmaps) {
     mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
     int32_t ret = tokenizer.tokenize(output);
-    if (ret == 0 && std::getenv("E2VL_STATS") != nullptr) {
-        // Inspect chunks to verify IMG_CONTEXT replacement: we should NOT see the literal media marker any more,
-        // instead we have an IMAGE chunk with projected embeddings.
-        size_t n_chunks = mtmd_input_chunks_size(output);
-        for (size_t i = 0; i < n_chunks; ++i) {
-            auto * chunk = mtmd_input_chunks_get(output, i);
-            if (!chunk) continue;
-            if (mtmd_input_chunk_get_type(chunk) == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
-                // find left and right neighboring text token ids for context
-                llama_token left_id = -1;
-                llama_token right_id = -1;
-                // search left
-                for (int64_t j = (int64_t)i - 1; j >= 0; --j) {
-                    auto * c2 = mtmd_input_chunks_get(output, j);
-                    if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-                        size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt);
-                        if (ntt > 0) { left_id = toks[ntt - 1]; }
-                        break;
-                    }
-                }
-                // search right
-                for (size_t j = i + 1; j < n_chunks; ++j) {
-                    auto * c2 = mtmd_input_chunks_get(output, j);
-                    if (c2 && mtmd_input_chunk_get_type(c2) == MTMD_INPUT_CHUNK_TYPE_TEXT) {
-                        size_t ntt = 0; auto * toks = mtmd_input_chunk_get_tokens_text(c2, &ntt);
-                        if (ntt > 0) { right_id = toks[0]; }
-                        break;
-                    }
-                }
-                const mtmd_image_tokens * img = mtmd_input_chunk_get_tokens_image(chunk);
-                uint32_t inserted_tokens = img ? img->n_tokens() : 0;
-                printf("[E2VL] IMG_CONTEXT replaced: left_id=%d right_id=%d inserted_image_tokens=%u\n",
-                       (int)left_id, (int)right_id, inserted_tokens);
-            }
-        }
-    }
     return ret;
 }
 

From 0fcfb37dc5ab7b9f66231a5a971ff16d5c00448d Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Wed, 12 Nov 2025 18:28:10 +0200
Subject: [PATCH 7/8] =?UTF-8?q?feat(mtmd):=20integrate=20Eagle2-VL=20mmpro?=
 =?UTF-8?q?j=20=E2=80=94=20media=20ingest,=20projector=20routing,=20and=20?=
 =?UTF-8?q?shape=20validation?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Scope strictly to Eagle2-VL with config/arch guards
- Remove temporary debug logs; preserve upstream logging semantics
- Keep tokenizer/encode path clean; no behavior change for non-vision models
---
 tools/mtmd/hf_preprocess_dump.py | 42 --------------------------------
 1 file changed, 42 deletions(-)
 delete mode 100755 tools/mtmd/hf_preprocess_dump.py

diff --git a/tools/mtmd/hf_preprocess_dump.py b/tools/mtmd/hf_preprocess_dump.py
deleted file mode 100755
index a70efe81cc233..0000000000000
--- a/tools/mtmd/hf_preprocess_dump.py
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/env python3
-"""
-hf_preprocess_dump.py
-Create a planar C,H,W float32 dump matching the C++ E2VL preprocessing (resize->center crop->normalize SigLIP mean=0.5 std=0.5).
-Usage: python3 hf_preprocess_dump.py input.jpg out.bin
-"""
-import sys
-from PIL import Image
-import numpy as np
-
-if len(sys.argv) < 3:
-    print("usage: hf_preprocess_dump.py INPUT_IMAGE OUT_BIN")
-    sys.exit(2)
-
-inp = sys.argv[1]
-out = sys.argv[2]
-
-image_size = 448
-mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
-std = np.array([0.5, 0.5, 0.5], dtype=np.float32)
-
-img = Image.open(inp).convert('RGB')
-# Resize short side to image_size, keep aspect, then center crop
-w, h = img.size
-if w < h:
-    new_w = image_size
-    new_h = int(round(h * (image_size / w)))
-else:
-    new_h = image_size
-    new_w = int(round(w * (image_size / h)))
-img = img.resize((new_w, new_h), resample=Image.BICUBIC)
-# center crop
-left = (new_w - image_size) // 2
-top = (new_h - image_size) // 2
-img = img.crop((left, top, left + image_size, top + image_size))
-arr = np.array(img).astype(np.float32) / 255.0
-# Normalize (SigLIP style): (x - mean)/std
-arr = (arr - mean) / std
-# Convert to planar C,H,W
-planar = np.transpose(arr, (2,0,1)).astype(np.float32)
-planar.tofile(out)
-print(f"wrote {out}: shape={planar.shape}, bytes={planar.nbytes}")

From f83bb7b6583a59d1536a8c5d94e349a223ee1248 Mon Sep 17 00:00:00 2001
From: YaelGitAccount <Ya0504124870@gmail.com>
Date: Wed, 12 Nov 2025 23:28:35 +0200
Subject: [PATCH 8/8] chore(mtmd): revert mtmd{,-cli}.cpp to upstream/master;
 keep Eagle2-VL logic isolated in clip.cpp

---
 tools/mtmd/mtmd-cli.cpp |  9 ++++----
 tools/mtmd/mtmd.cpp     | 47 +++++++++++++++++++++++------------------
 2 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index d0c9c961cc156..3e19e95958a2f 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -179,8 +179,6 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
         generated_tokens.push_back(token_id);
         common_sampler_accept(ctx.smpl, token_id, true);
 
-        
-
         if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) {
             LOG("\n");
             break; // end of generation
@@ -213,7 +211,8 @@ static int generate_response(mtmd_cli_context & ctx, int n_predict) {
 }
 
 static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & new_msg) {
-    // format and append message
+    LOG_DBG("chat_add_and_format: new_msg.role='%s', new_msg.content='%s'\n",
+        new_msg.role.c_str(), new_msg.content.c_str());
     auto formatted = common_chat_format_single(ctx.tmpls.get(), ctx.chat_history,
         new_msg, new_msg.role == "user",
         ctx.use_jinja);
@@ -224,6 +223,7 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg &
 static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     bool add_bos = ctx.chat_history.empty();
     auto formatted_chat = chat_add_and_format(ctx, msg);
+    LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str());
 
     mtmd_input_text text;
     text.text          = formatted_chat.c_str();
@@ -311,8 +311,7 @@ int main(int argc, char ** argv) {
 
     if (is_single_turn) {
         g_is_generating = true;
-        const bool has_default_media_marker = params.prompt.find(mtmd_default_marker()) != std::string::npos;
-        if (!has_default_media_marker) {
+        if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
             for (size_t i = 0; i < params.image.size(); i++) {
                 params.prompt += mtmd_default_marker();
             }
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index e7ed5593aa690..e599137769963 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
 }
 
 mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu = true;
-    params.print_timings = true;
-    params.n_threads = 4;
-    params.verbosity = GGML_LOG_LEVEL_INFO;
-    params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
-    params.media_marker = mtmd_default_marker();
-    params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
-    params.image_min_tokens = -1;
-    params.image_max_tokens = -1;
+    mtmd_context_params params {
+        /* use_gpu           */ true,
+        /* print_timings     */ true,
+        /* n_threads         */ 4,
+        /* verbosity         */ GGML_LOG_LEVEL_INFO,
+        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker      */ mtmd_default_marker(),
+        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens  */ -1,
+        /* image_max_tokens  */ -1,
+    };
     return params;
 }
 
@@ -162,7 +163,7 @@ struct mtmd_context {
         print_timings(ctx_params.print_timings),
         n_threads    (ctx_params.n_threads),
         media_marker (ctx_params.media_marker),
-        n_embd_text  (llama_model_n_embd(text_model))
+        n_embd_text  (llama_model_n_embd_inp(text_model))
     {
         if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
             throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
@@ -172,13 +173,13 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        clip_context_params ctx_clip_params;
-        ctx_clip_params.use_gpu          = ctx_params.use_gpu;
-        ctx_clip_params.verbosity        = ctx_params.verbosity;
-        ctx_clip_params.flash_attn_type  = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
-        // custom image token limits
-        ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
-        ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
+        clip_context_params ctx_clip_params {
+            /* use_gpu           */ ctx_params.use_gpu,
+            /* verbosity         */ ctx_params.verbosity,
+            /* flash_attn_type   */ CLIP_FLASH_ATTN_TYPE_AUTO,
+            /* image_min_tokens  */ ctx_params.image_min_tokens,
+            /* image_max_tokens  */ ctx_params.image_max_tokens,
+        };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;
@@ -479,6 +480,7 @@ struct mtmd_tokenizer {
     }
 
     void add_text(const std::string & txt, bool parse_special) {
+        LOG_DBG("%s: %s\n", __func__, txt.c_str());
         auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
         add_text(tokens);
     }
@@ -612,6 +614,10 @@ struct mtmd_tokenizer {
                 image_tokens->batch_f32 = std::move(batch_f32);
                 image_tokens->id = bitmap->id; // optional
 
+                LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
+                LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
+                LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
+
                 mtmd_input_chunk chunk{
                     MTMD_INPUT_CHUNK_TYPE_IMAGE,
                     {}, // text tokens
@@ -671,6 +677,8 @@ struct mtmd_tokenizer {
                 audio_tokens->batch_f32 = std::move(batch_f32);
                 audio_tokens->id = bitmap->id; // optional
 
+                LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
+
                 mtmd_input_chunk chunk{
                     MTMD_INPUT_CHUNK_TYPE_AUDIO,
                     {}, // text tokens
@@ -758,8 +766,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
             const mtmd_bitmap ** bitmaps,
             size_t n_bitmaps) {
     mtmd_tokenizer tokenizer(ctx, text, bitmaps, n_bitmaps);
-    int32_t ret = tokenizer.tokenize(output);
-    return ret;
+    return tokenizer.tokenize(output);
 }
 
 int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {