common : more accurate sampling timing #17382
Changes from all commits: 81f238b, cecc8b7, bc8587a, 22e9963, c051897, 96566b0, 4c26677, f99ce95
First changed file:
```diff
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"

+#include <algorithm>
 #include <cmath>
+#include <cstring>
 #include <unordered_map>
-#include <algorithm>

 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
```
```diff
@@ -112,6 +113,13 @@ struct common_sampler {
     llama_token_data_array cur_p;

+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
```
```diff
@@ -128,6 +136,12 @@ struct common_sampler {
         cur_p = { cur.data(), cur.size(), -1, false };
     }

+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };

 std::string common_params_sampling::print() const {
```
```diff
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }

 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
```
```diff
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
 }

 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }

 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
```

Contributor: Previously we did not reset the `prev` ring buffer; I presume this is a bugfix?
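The consolidated `reset()` changes behavior: previously only the grammar and the sampler chain were reset, while the `prev` ring buffer of recently accepted tokens survived into the next use of the sampler. A minimal self-contained sketch of the difference, using `std::vector` and plain counters as stand-ins for the actual llama.cpp types (all names below are illustrative, not the real API):

```cpp
#include <cstdio>
#include <vector>

// Stand-in for common_sampler: a token history ("prev") next to sampler state.
struct toy_sampler {
    std::vector<int> prev;   // recently accepted tokens (a ring buffer in the real code)
    int grmr_state  = 0;     // stand-in for the grammar sampler state
    int chain_state = 0;     // stand-in for the sampler chain state

    void accept(int token) {
        prev.push_back(token);
        chain_state++;       // pretend the chain accumulates per-token state
    }

    // old behavior: only the samplers were reset, "prev" survived
    void reset_old() {
        grmr_state  = 0;
        chain_state = 0;
    }

    // new behavior (this PR): clear the token history as well
    void reset_new() {
        prev.clear();
        grmr_state  = 0;
        chain_state = 0;
    }
};

int main() {
    toy_sampler s;
    for (int t : {3, 1, 4, 1, 5}) {
        s.accept(t);
    }

    s.reset_old();
    printf("after old reset: prev.size() = %zu\n", s.prev.size()); // 5 -> stale history remains

    s.reset_new();
    printf("after new reset: prev.size() = %zu\n", s.prev.size()); // 0 -> next sequence starts clean

    return 0;
}
```

Clearing the history on reset means a reused sampler does not carry token history over from a previous sequence.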
```diff
@@ -327,16 +341,54 @@
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance

+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx, 0, sizeof(data_ctx));
+
     if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }

     if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+        LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);

         llama_memory_breakdown_print(ctx);
     }
 }

 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);

     auto & grmr = gsmpl->grmr;
```
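The new `unaccounted time` line is plain arithmetic over the counters printed above: whatever part of the wall-clock total is not attributed to sampling, prompt eval, or eval. A small self-contained example of the same computation with made-up numbers (the values are illustrative, not measurements from this PR):

```cpp
#include <cstdio>

int main() {
    // hypothetical timings, in milliseconds
    const double t_total_ms    = 12500.0; // t_end_ms - data.t_start_ms
    const double t_sampling_ms =   180.0; // accumulated in gsmpl->t_total_us (common/sampling)
    const double t_p_eval_ms   =   950.0; // prompt eval
    const double t_eval_ms     = 11000.0; // token-by-token eval

    // same expressions as in common_perf_print
    const double t_unacc_ms = t_total_ms - (t_sampling_ms + t_p_eval_ms + t_eval_ms);
    const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;

    printf("unaccounted time = %10.2f ms / %5.1f %%\n", t_unacc_ms, t_unacc_pc);
    // -> unaccounted time =     370.00 ms /   3.0 %
    return 0;
}
```

With sampling time now measured inside common/sampling, this remainder should stay small; a large value points at overhead outside the measured phases.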
```diff
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers

 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;

     if (do_sort && !res->sorted) {
```
Second changed file:
```diff
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
         return 1;
     }

-    auto * mem = llama_get_memory(ctx);
+    llama_memory_t mem = llama_get_memory(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);

+    // note: the time for chat template initialization is not negligible:
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
+
+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);

     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
```

Member (PR author), commenting on lines +152 to +158: Didn't realize until now that this chat template initialization call can take a significant amount of time (tens of milliseconds). Accounting for this, the reported timings for sampling, prompt eval, eval and total now add up nicely.
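One way to see the cost the author mentions is to wall-clock the chat template initialization with the same timer the perf counters use. This is a rough sketch, assuming it sits in `main()` next to the lines above where `model`, `params`, and `ctx` already exist; the tens-of-milliseconds figure is the author's observation, not a guarantee:

```cpp
// sketch: measure how long chat template initialization takes,
// using the same timer the perf counters are based on
const int64_t t_tmpl_start_us = ggml_time_us();

auto chat_templates = common_chat_templates_init(model, params.chat_template);

const int64_t t_tmpl_us = ggml_time_us() - t_tmpl_start_us;
LOG_INF("%s: chat template init took %.2f ms\n", __func__, 1e-3 * t_tmpl_us);

// only start counting perf from here, so the template init time
// does not show up as unaccounted time in common_perf_print
llama_perf_context_reset(ctx);
```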
Comment: This struct (`common_time_meas`) is effectively a code dup of `time_meas` defined in `llama-impl.h`. Not sure if this is something we would like to avoid.

Reply: It's OK to duplicate as it is quite simple functionality.
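For context, a helper of this kind is a small RAII timer: it records the start time on construction and, unless measurement is disabled, adds the elapsed microseconds to a caller-provided accumulator on destruction. That is what lets the `const auto tm = gsmpl->tm();` lines in the diff accumulate into `t_total_us` without explicit stop calls. A self-contained sketch of the idea (illustrative only; the real `time_meas` in `llama-impl.h` may differ in details):

```cpp
#include <chrono>
#include <cstdint>
#include <cstdio>

// Scoped timer: accumulates elapsed microseconds into "acc" when it goes
// out of scope, unless disabled (e.g. by a no_perf flag).
struct scoped_time_meas {
    scoped_time_meas(int64_t & acc, bool disable = false)
        : t_acc(acc), disabled(disable),
          t_start(std::chrono::steady_clock::now()) {}

    ~scoped_time_meas() {
        if (disabled) {
            return;
        }
        const auto t_end = std::chrono::steady_clock::now();
        t_acc += std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_start).count();
    }

    int64_t & t_acc;
    bool      disabled;
    std::chrono::steady_clock::time_point t_start;
};

static int64_t g_total_us = 0;

void do_some_work() {
    // every call adds its own duration to g_total_us, mirroring how
    // gsmpl->tm() accumulates into gsmpl->t_total_us
    scoped_time_meas tm(g_total_us);

    volatile double x = 0.0;
    for (int i = 0; i < 1000000; ++i) {
        x += i * 0.5;
    }
}

int main() {
    for (int i = 0; i < 3; ++i) {
        do_some_work();
    }
    printf("accumulated time: %.2f ms\n", 1e-3 * g_total_us);
    return 0;
}
```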