From 0b133f9e7dad67d1edd1c39ce52dc38071ba660d Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Sun, 30 Nov 2025 11:59:02 +0100
Subject: [PATCH 1/2] model : Fix media marker placement for LFM2-VL in single turn llama-mtmd-cli

---
 tools/mtmd/mtmd-cli.cpp | 13 ++++++++++++-
 tools/mtmd/mtmd.cpp     |  9 +++++++++
 tools/mtmd/mtmd.h       |  8 ++++++++
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp
index 6679de309b4..6b939710dce 100644
--- a/tools/mtmd/mtmd-cli.cpp
+++ b/tools/mtmd/mtmd-cli.cpp
@@ -265,6 +265,17 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) {
     return 0;
 }
 
+// Returns a copy of msg with one default media marker added at the position reported by
+// mtmd_get_default_marker_placement(); anything other than BEGIN appends, as the CLI did before.
+static std::string insert_default_marker(mtmd_context * ctx, const std::string & msg) {
+    switch (mtmd_get_default_marker_placement(ctx)) {
+        case MTMD_DEFAULT_MARKER_PLACEMENT_BEGIN: return mtmd_default_marker() + msg;
+        case MTMD_DEFAULT_MARKER_PLACEMENT_NONE:
+        case MTMD_DEFAULT_MARKER_PLACEMENT_END:
+        default: return msg + mtmd_default_marker();
+    }
+}
+
 int main(int argc, char ** argv) {
     ggml_time_init();
 
@@ -313,7 +324,7 @@ int main(int argc, char ** argv) {
         g_is_generating = true;
         if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
             for (size_t i = 0; i < params.image.size(); i++) {
-                params.prompt += mtmd_default_marker();
+                params.prompt = insert_default_marker(ctx.ctx_vision.get(), params.prompt);
             }
         }
         common_chat_msg msg;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index dfad9cd7957..e29733502e0 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -1099,3 +1099,12 @@ void mtmd_log_set(ggml_log_callback log_callback, void * user_data) {
     g_logger_state.log_callback = log_callback ? log_callback : clip_log_callback_default;
     g_logger_state.log_callback_user_data = user_data;
 }
+
+// BEGIN for a vision projector of type LFM2; NONE for a null context or any other projector
+enum mtmd_default_marker_placement mtmd_get_default_marker_placement(mtmd_context * ctx) {
+    if (ctx && ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_LFM2) {
+        return MTMD_DEFAULT_MARKER_PLACEMENT_BEGIN;
+    }
+
+    return MTMD_DEFAULT_MARKER_PLACEMENT_NONE;
+}
diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 015119be897..58630377061 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -51,6 +51,13 @@ enum mtmd_input_chunk_type {
     MTMD_INPUT_CHUNK_TYPE_AUDIO,
 };
 
+// where the default media marker should go when the caller's message does not already contain one
+enum mtmd_default_marker_placement {
+    MTMD_DEFAULT_MARKER_PLACEMENT_NONE,  // no preference: the marker may be placed anywhere in the message
+    MTMD_DEFAULT_MARKER_PLACEMENT_BEGIN, // place the media marker at the beginning of the message
+    MTMD_DEFAULT_MARKER_PLACEMENT_END,   // place the media marker at the end of the message
+};
+
 // opaque types
 struct mtmd_context;
 struct mtmd_bitmap;
@@ -89,6 +96,7 @@ struct mtmd_context_params {
 };
 
 MTMD_API const char * mtmd_default_marker(void);
+MTMD_API enum mtmd_default_marker_placement mtmd_get_default_marker_placement(mtmd_context * ctx);
 
 MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 

From 73ff3d0c09f6e5620d4c7905900f3bd6951f795e Mon Sep 17 00:00:00 2001
From: Tarek Dakhran
Date: Sun, 30 Nov 2025 12:24:41 +0100
Subject: [PATCH 2/2] Formatting

---
 tools/mtmd/mtmd.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h
index 58630377061..fff1ac8ec82 100644
--- a/tools/mtmd/mtmd.h
+++ b/tools/mtmd/mtmd.h
@@ -95,7 +95,7 @@ struct mtmd_context_params {
     int image_max_tokens; // maximum number of tokens for image input (default: read from metadata)
 };
 
-MTMD_API const char * mtmd_default_marker(void);
+MTMD_API const char *                       mtmd_default_marker(void);
 MTMD_API enum mtmd_default_marker_placement mtmd_get_default_marker_placement(mtmd_context * ctx);
 
 MTMD_API struct mtmd_context_params mtmd_context_params_default(void);