Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tools/mtmd/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2761,6 +2761,7 @@ struct clip_model_loader {
{
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
// TODO: verify the image_min_tokens
hparams.n_merge = 1; // the original pixtral does not use patch merging
hparams.rope_theta = 10000.0f;
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
hparams.set_limit_image_tokens(8, 1024);
Expand Down
35 changes: 18 additions & 17 deletions tools/mtmd/mtmd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,16 +101,17 @@ static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_
}

mtmd_context_params mtmd_context_params_default() {
mtmd_context_params params;
params.use_gpu = true;
params.print_timings = true;
params.n_threads = 4;
params.verbosity = GGML_LOG_LEVEL_INFO;
params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
params.media_marker = mtmd_default_marker();
params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
params.image_min_tokens = -1;
params.image_max_tokens = -1;
mtmd_context_params params {
/* use_gpu */ true,
/* print_timings */ true,
/* n_threads */ 4,
/* verbosity */ GGML_LOG_LEVEL_INFO,
/* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
/* media_marker */ mtmd_default_marker(),
/* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ -1,
/* image_max_tokens */ -1,
};
return params;
}

Expand Down Expand Up @@ -172,13 +173,13 @@ struct mtmd_context {
throw std::runtime_error("media_marker must not be empty");
}

clip_context_params ctx_clip_params;
ctx_clip_params.use_gpu = ctx_params.use_gpu;
ctx_clip_params.verbosity = ctx_params.verbosity;
ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
// custom image token limits
ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
clip_context_params ctx_clip_params {
/* use_gpu */ ctx_params.use_gpu,
/* verbosity */ ctx_params.verbosity,
/* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
};

auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;
Expand Down
Loading