diff --git a/common/common.cpp b/common/common.cpp
index 4dc95dcba2260..f3cc55247e718 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -60,6 +59,14 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+common_time_meas::~common_time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
 //
 // CPU utils
 //
diff --git a/common/common.h b/common/common.h
index f42c083faa254..de5b404dd8895 100644
--- a/common/common.h
+++ b/common/common.h
@@ -2,17 +2,15 @@
 
 #pragma once
 
+#include "ggml-opt.h"
+#include "llama-cpp.h"
+
 #include
 #include
 #include
 #include
 #include
 #include
-#include
-#include
-
-#include "ggml-opt.h"
-#include "llama-cpp.h"
 
 #ifdef _WIN32
 #define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
+struct common_time_meas {
+    common_time_meas(int64_t & t_acc, bool disable = false);
+    ~common_time_meas();
+
+    const int64_t t_start_us;
+
+    int64_t & t_acc;
+};
+
 struct common_adapter_lora_info {
     std::string path;
     float scale;
diff --git a/common/sampling.cpp b/common/sampling.cpp
index c69d525b5b358..7a6b7be1e0ee6 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -3,9 +3,10 @@
 #include "common.h"
 #include "log.h"
 
+#include
 #include
+#include
 #include
-#include
 
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 // TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {
 
     llama_token_data_array cur_p;
 
+    void reset() {
+        prev.clear();
+
+        llama_sampler_reset(grmr);
+        llama_sampler_reset(chain);
+    }
+
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
@@ -128,6 +136,12 @@ struct common_sampler {
 
         cur_p = { cur.data(), cur.size(), -1, false };
     }
+
+    common_time_meas tm() {
+        return common_time_meas(t_total_us, params.no_perf);
+    }
+
+    mutable int64_t t_total_us = 0;
 };
 
 std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
 }
 
 void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
+    const auto tm = gsmpl->tm();
+
     if (accept_grammar) {
         llama_sampler_accept(gsmpl->grmr, token);
     }
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
 }
 
 void common_sampler_reset(struct common_sampler * gsmpl) {
-    llama_sampler_reset(gsmpl->grmr);
-
-    llama_sampler_reset(gsmpl->chain);
+    gsmpl->reset();
 }
 
 struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
 
 void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
     // TODO: measure grammar performance
+    const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;
+
+    llama_perf_sampler_data data_smpl;
+    llama_perf_context_data data_ctx;
+
+    memset(&data_smpl, 0, sizeof(data_smpl));
+    memset(&data_ctx, 0, sizeof(data_ctx));
+
     if (gsmpl) {
-        llama_perf_sampler_print(gsmpl->chain);
+        auto & data = data_smpl;
+
+        data = llama_perf_sampler(gsmpl->chain);
+
+        // note: the sampling time includes the samplers time + extra time spent in common/sampling
+        LOG_INF("%s:    sampling time = %10.2f ms\n", __func__, t_sampling_ms);
+        LOG_INF("%s:    samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
     }
+
     if (ctx) {
-        llama_perf_context_print(ctx);
+        auto & data = data_ctx;
+
+        data = llama_perf_context(ctx);
+
+        const double t_end_ms = 1e-3 * ggml_time_us();
+
+        const double t_total_ms = t_end_ms - data.t_start_ms;
+        const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
+        const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;
+
+        LOG_INF("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
+        LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
+        LOG_INF("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
+                __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+        LOG_INF("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
+        LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
+        LOG_INF("%s:    graphs reused = %10d\n", __func__, data.n_reused);
+
         llama_memory_breakdown_print(ctx);
     }
 }
 
 llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
+    llama_synchronize(ctx);
+
+    // start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
+    const auto tm = gsmpl->tm();
+
     gsmpl->set_logits(ctx, idx);
 
     auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
 // helpers
 
 llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
+    const auto tm = gsmpl->tm();
+
     auto * res = &gsmpl->cur_p;
 
     if (do_sort && !res->sorted) {
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index cefa39a57c886..80c693ce61c01 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -4,10 +4,10 @@
 #include "llama.h"
 #include "ggml.h"
 
+#include
 #include
 #include
 #include
-#include
 
 /**
  * This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
     return u.f;
 }
 
-static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
+static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
     float v;
     if (type == GGML_TYPE_F16) {
-        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
+        v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
     } else if (type == GGML_TYPE_F32) {
-        v = *(float *) &data[i];
+        v = *(const float *) &data[i];
     } else if (type == GGML_TYPE_I64) {
-        v = (float) *(int64_t *) &data[i];
+        v = (float) *(const int64_t *) &data[i];
     } else if (type == GGML_TYPE_I32) {
-        v = (float) *(int32_t *) &data[i];
+        v = (float) *(const int32_t *) &data[i];
     } else if (type == GGML_TYPE_I16) {
-        v = (float) *(int16_t *) &data[i];
+        v = (float) *(const int16_t *) &data[i];
     } else if (type == GGML_TYPE_I8) {
-        v = (float) *(int8_t *) &data[i];
+        v = (float) *(const int8_t *) &data[i];
     } else if (type == GGML_TYPE_BF16) {
-        v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
+        v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
     } else {
         GGML_ABORT("fatal error");
     }
diff --git a/src/llama-impl.cpp b/src/llama-impl.cpp
index 6ec709dd323a6..c7a1880aad3db 100644
--- a/src/llama-impl.cpp
+++ b/src/llama-impl.cpp
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
 time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
 
 time_meas::~time_meas() {
-        if (t_start_us >= 0) {
-            t_acc += ggml_time_us() - t_start_us;
-        }
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
     }
+}
 
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     ggml_log_set(log_callback, user_data);
diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index adb3f8810ed33..3f4a729bc36c7 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
     for (auto * smpl : chain->samplers) {
         llama_sampler_reset(smpl);
     }
-
-    chain->t_sample_us = 0;
-    chain->n_sample = 0;
 }
 
 static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
 void llama_perf_sampler_print(const struct llama_sampler * chain) {
     const auto data = llama_perf_sampler(chain);
 
-    LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-            __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
+    LLAMA_LOG_INFO("%s:    samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
 }
 
 void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {
 
     auto * ctx = (struct llama_sampler_chain *) chain->ctx;
 
-    ctx->t_sample_us = ctx->n_sample = 0;
+    ctx->t_sample_us = 0;
+    ctx->n_sample = 0;
 }
diff --git a/tools/main/main.cpp b/tools/main/main.cpp
index 33e8862335793..78b42267b59c3 100644
--- a/tools/main/main.cpp
+++ b/tools/main/main.cpp
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto * mem = llama_get_memory(ctx);
-
+    llama_memory_t mem = llama_get_memory(ctx);
     const llama_vocab * vocab = llama_model_get_vocab(model);
+
+    // note: the time for chat template initialization is not negligible:
     auto chat_templates = common_chat_templates_init(model, params.chat_template);
 
+    // start measuring performance timings from here
+    llama_perf_context_reset(ctx);
+
     LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);
 
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
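
For context (not part of the patch): the `common_time_meas` helper added in `common/common.h` is a scope-based RAII timer. It records the start time on construction, and its destructor adds the elapsed microseconds to a caller-owned accumulator; when constructed with `disable = true` (e.g. `params.no_perf`) it measures nothing. A minimal standalone sketch of that pattern, assuming a hypothetical `std::chrono` stand-in for `ggml_time_us()` so it compiles outside the repo:

```cpp
// Illustrative only: mirrors the RAII accumulator pattern from the patch,
// with ggml_time_us() replaced by a std::chrono-based stand-in.
#include <cstdint>
#include <cstdio>
#include <chrono>
#include <thread>

static int64_t fake_time_us() {
    using namespace std::chrono;
    return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
}

struct common_time_meas {
    common_time_meas(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : fake_time_us()), t_acc(t_acc) {}

    ~common_time_meas() {
        if (t_start_us >= 0) {
            t_acc += fake_time_us() - t_start_us; // accumulate elapsed time on scope exit
        }
    }

    const int64_t t_start_us;
    int64_t &     t_acc;
};

int main() {
    int64_t t_total_us = 0;
    {
        const common_time_meas tm(t_total_us);                      // starts the clock
        std::this_thread::sleep_for(std::chrono::milliseconds(5));  // stand-in for sampling work
    }                                                               // destructor adds the elapsed time
    printf("accumulated: %lld us\n", (long long) t_total_us);
    return 0;
}
```

In the patch, `common_sampler::tm()` returns such an object by value, so the measurement covers exactly the scope of `common_sampler_sample()`, `common_sampler_accept()` and `common_sampler_get_candidates()` where it is created, and `common_perf_print()` reports the accumulated `t_total_us` as the "sampling time" line.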