From 587af774cd53511353882ec584e11c2c218d4cbe Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Mon, 3 Nov 2025 22:13:18 +0100
Subject: [PATCH] mtmd: improve struct initialization

Build mtmd_context_params and clip_context_params with a single
aggregate initializer instead of declaring the struct and assigning
its members one by one. Each initializer is annotated with the name
of the member it is meant to fill, in declaration order.

This is a refactor only: the clip flash-attention type is still
derived from the mtmd setting through mtmd_get_clip_flash_attn_type(),
exactly as the assignment it replaces did.

For pixtral, set hparams.n_merge to 1 before
get_u32(KEY_SPATIAL_MERGE_SIZE, ...) is called, since the original
pixtral does not use patch merging.
---
 tools/mtmd/clip.cpp |  1 +
 tools/mtmd/mtmd.cpp | 35 ++++++++++++++++++-----------------
 2 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 99775cb3e351c..67a8a470c8302 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2761,6 +2761,7 @@ struct clip_model_loader {
                     {
                         // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                         // TODO: verify the image_min_tokens
+                        hparams.n_merge = 1; // the original pixtral does not use patch merging
                         hparams.rope_theta = 10000.0f;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         hparams.set_limit_image_tokens(8, 1024);
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 325f7ff995e36..4343f3b6fc3b6 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
 }
 
 mtmd_context_params mtmd_context_params_default() {
-    mtmd_context_params params;
-    params.use_gpu          = true;
-    params.print_timings    = true;
-    params.n_threads        = 4;
-    params.verbosity        = GGML_LOG_LEVEL_INFO;
-    params.image_marker     = MTMD_DEFAULT_IMAGE_MARKER;
-    params.media_marker     = mtmd_default_marker();
-    params.flash_attn_type  = LLAMA_FLASH_ATTN_TYPE_AUTO;
-    params.image_min_tokens = -1;
-    params.image_max_tokens = -1;
+    mtmd_context_params params {
+        /* use_gpu           */ true,
+        /* print_timings     */ true,
+        /* n_threads         */ 4,
+        /* verbosity         */ GGML_LOG_LEVEL_INFO,
+        /* image_marker      */ MTMD_DEFAULT_IMAGE_MARKER,
+        /* media_marker      */ mtmd_default_marker(),
+        /* flash_attn_type   */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+        /* image_min_tokens  */ -1,
+        /* image_max_tokens  */ -1,
+    };
     return params;
 }
 
@@ -172,13 +173,13 @@ struct mtmd_context {
             throw std::runtime_error("media_marker must not be empty");
         }
 
-        clip_context_params ctx_clip_params;
-        ctx_clip_params.use_gpu   = ctx_params.use_gpu;
-        ctx_clip_params.verbosity = ctx_params.verbosity;
-        ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
-        // custom image token limits
-        ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
-        ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
+        clip_context_params ctx_clip_params {
+            /* use_gpu           */ ctx_params.use_gpu,
+            /* verbosity         */ ctx_params.verbosity,
+            /* flash_attn_type   */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
+            /* image_min_tokens  */ ctx_params.image_min_tokens,
+            /* image_max_tokens  */ ctx_params.image_max_tokens,
+        };
 
         auto res = clip_init(mmproj_fname, ctx_clip_params);
         ctx_v = res.ctx_v;