Merged
9 changes: 8 additions & 1 deletion common/common.cpp
@@ -26,7 +26,6 @@
#include <sstream>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

@@ -60,6 +59,14 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

common_time_meas::common_time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

common_time_meas::~common_time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}

//
// CPU utils
//
17 changes: 12 additions & 5 deletions common/common.h
@@ -2,17 +2,15 @@

#pragma once

#include "ggml-opt.h"
#include "llama-cpp.h"

#include <set>
#include <sstream>
#include <string>
#include <string_view>
#include <vector>
#include <map>
#include <sstream>
#include <cmath>

#include "ggml-opt.h"
#include "llama-cpp.h"

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
@@ -30,6 +28,15 @@

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

struct common_time_meas {
common_time_meas(int64_t & t_acc, bool disable = false);
~common_time_meas();

const int64_t t_start_us;

int64_t & t_acc;
};

Comment on lines +31 to +39

Contributor: This struct is effectively a code dup of time_meas defined in llama-impl.h. Not sure if this is something we would like to avoid.

Member (Author): It's OK to duplicate as it is quite simple functionality.
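For readers who have not seen the original, the duplicated functionality is a small RAII accumulator: construction records a start timestamp, destruction adds the elapsed time to a counter owned by the caller. A self-contained sketch of the same idea (scoped_time_meas and std::chrono are stand-ins for the ggml_time_us-based structs in this PR, not part of it):

// Illustrative stand-in for the duplicated pattern: a tiny RAII accumulator that
// adds the elapsed time of a scope to a caller-owned counter.
#include <chrono>
#include <cstdint>
#include <cstdio>

struct scoped_time_meas {
    static int64_t now_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    scoped_time_meas(int64_t & t_acc, bool disable = false)
        : t_start_us(disable ? -1 : now_us()), t_acc(t_acc) {}

    ~scoped_time_meas() {
        if (t_start_us >= 0) {
            t_acc += now_us() - t_start_us; // accumulate on scope exit
        }
    }

    const int64_t t_start_us;
    int64_t &     t_acc;
};

int main() {
    int64_t t_total_us = 0;
    {
        scoped_time_meas tm(t_total_us); // starts the clock
        volatile long sink = 0;
        for (long i = 0; i < 1000000; ++i) {
            sink = sink + i; // some work to measure
        }
    } // destructor adds the elapsed time to t_total_us
    printf("accumulated: %lld us\n", (long long) t_total_us);
    return 0;
}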

struct common_adapter_lora_info {
std::string path;
float scale;
66 changes: 60 additions & 6 deletions common/sampling.cpp
@@ -3,9 +3,10 @@
#include "common.h"
#include "log.h"

#include <algorithm>
#include <cmath>
#include <cstring>
#include <unordered_map>
#include <algorithm>

// the ring buffer works similarly to std::deque, but with a fixed capacity
// TODO: deduplicate with llama-impl.h
@@ -112,6 +113,13 @@ struct common_sampler {

llama_token_data_array cur_p;

void reset() {
prev.clear();

llama_sampler_reset(grmr);
llama_sampler_reset(chain);
}

void set_logits(struct llama_context * ctx, int idx) {
const auto * logits = llama_get_logits_ith(ctx, idx);

@@ -128,6 +136,12 @@

cur_p = { cur.data(), cur.size(), -1, false };
}

common_time_meas tm() {
return common_time_meas(t_total_us, params.no_perf);
}

mutable int64_t t_total_us = 0;
};

std::string common_params_sampling::print() const {
@@ -298,6 +312,8 @@ void common_sampler_free(struct common_sampler * gsmpl) {
}

void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
const auto tm = gsmpl->tm();

if (accept_grammar) {
llama_sampler_accept(gsmpl->grmr, token);
}
@@ -308,9 +324,7 @@ void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, boo
}

void common_sampler_reset(struct common_sampler * gsmpl) {
llama_sampler_reset(gsmpl->grmr);

llama_sampler_reset(gsmpl->chain);
gsmpl->reset();
}

Contributor: Previously we did not reset prev ringbuffer, I presume this is a bugfix?
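For context on what the extra prev.clear() changes: prev holds the sampler's recent token history, so before this change a reset sampler could still report tokens accepted during a previous generation. A minimal illustration (tok_a and tok_b are placeholder tokens; only the common_sampler calls shown in this diff are used):

// tokens accepted while generating a first sequence
common_sampler_accept(gsmpl, tok_a, /*accept_grammar=*/true);
common_sampler_accept(gsmpl, tok_b, /*accept_grammar=*/true);

// starting an independent generation: the reset now also clears `prev`,
// so the history no longer contains tok_a/tok_b from the previous run
common_sampler_reset(gsmpl);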

struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
@@ -327,16 +341,54 @@ struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
// TODO: measure grammar performance

const double t_sampling_ms = gsmpl ? 1e-3*gsmpl->t_total_us : 0;

llama_perf_sampler_data data_smpl;
llama_perf_context_data data_ctx;

memset(&data_smpl, 0, sizeof(data_smpl));
memset(&data_ctx, 0, sizeof(data_ctx));

if (gsmpl) {
llama_perf_sampler_print(gsmpl->chain);
auto & data = data_smpl;

data = llama_perf_sampler(gsmpl->chain);

// note: the sampling time includes the samplers time + extra time spent in common/sampling
LOG_INF("%s: sampling time = %10.2f ms\n", __func__, t_sampling_ms);
LOG_INF("%s: samplers time = %10.2f ms / %5d tokens\n", __func__, data.t_sample_ms, data.n_sample);
}

if (ctx) {
llama_perf_context_print(ctx);
auto & data = data_ctx;

data = llama_perf_context(ctx);

const double t_end_ms = 1e-3 * ggml_time_us();

const double t_total_ms = t_end_ms - data.t_start_ms;
const double t_unacc_ms = t_total_ms - (t_sampling_ms + data.t_p_eval_ms + data.t_eval_ms);
const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;

LOG_INF("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
LOG_INF("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
LOG_INF("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
LOG_INF("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
LOG_INF("%s: unaccounted time = %10.2f ms / %5.1f %% (total - sampling - prompt eval - eval) / (total)\n", __func__, t_unacc_ms, t_unacc_pc);
LOG_INF("%s: graphs reused = %10d\n", __func__, data.n_reused);

llama_memory_breakdown_print(ctx);
}
}

llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
llama_synchronize(ctx);

// start measuring sampling time after the llama_context synchronization in order to not measure any ongoing async operations
const auto tm = gsmpl->tm();

gsmpl->set_logits(ctx, idx);

auto & grmr = gsmpl->grmr;
@@ -428,6 +480,8 @@ uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
// helpers

llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl, bool do_sort) {
const auto tm = gsmpl->tm();

auto * res = &gsmpl->cur_p;

if (do_sort && !res->sorted) {
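To make the new "unaccounted time" line in common_perf_print above concrete: it is simply the wall-clock total minus the three measured phases (sampling, prompt eval, eval). A self-contained sketch with hypothetical numbers, not taken from any real run:

#include <cstdio>

int main() {
    // hypothetical timings in milliseconds
    const double t_total_ms    = 2000.0; // wall clock since llama_perf_context_reset
    const double t_sampling_ms =   40.0; // time accumulated in common/sampling
    const double t_p_eval_ms   =  300.0; // prompt processing
    const double t_eval_ms     = 1500.0; // token generation

    const double t_unacc_ms = t_total_ms - (t_sampling_ms + t_p_eval_ms + t_eval_ms); // 160 ms
    const double t_unacc_pc = 100.0 * t_unacc_ms / t_total_ms;                        // 8.0 %

    printf("unaccounted time = %10.2f ms / %5.1f %%\n", t_unacc_ms, t_unacc_pc);
    return 0;
}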
18 changes: 9 additions & 9 deletions examples/eval-callback/eval-callback.cpp
@@ -4,10 +4,10 @@
#include "llama.h"
#include "ggml.h"

#include <cmath>
#include <cstdio>
#include <string>
#include <vector>
#include <numeric>

/**
* This the arbitrary data which will be passed to each callback.
@@ -37,23 +37,23 @@ static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
return u.f;
}

static float ggml_get_float_value(uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
static float ggml_get_float_value(const uint8_t * data, ggml_type type, const size_t * nb, size_t i0, size_t i1, size_t i2, size_t i3) {
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
v = ggml_fp16_to_fp32(*(const ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) &data[i];
v = *(const float *) &data[i];
} else if (type == GGML_TYPE_I64) {
v = (float) *(int64_t *) &data[i];
v = (float) *(const int64_t *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) &data[i];
v = (float) *(const int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) &data[i];
v = (float) *(const int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) &data[i];
v = (float) *(const int8_t *) &data[i];
} else if (type == GGML_TYPE_BF16) {
v = ggml_compute_bf16_to_fp32(*(ggml_bf16_t *) &data[i]);
v = ggml_compute_bf16_to_fp32(*(const ggml_bf16_t *) &data[i]);
} else {
GGML_ABORT("fatal error");
}
6 changes: 3 additions & 3 deletions src/llama-impl.cpp
@@ -20,10 +20,10 @@ static llama_logger_state g_logger_state;
time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}

time_meas::~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}

void llama_log_set(ggml_log_callback log_callback, void * user_data) {
ggml_log_set(log_callback, user_data);
9 changes: 3 additions & 6 deletions src/llama-sampling.cpp
@@ -472,9 +472,6 @@ static void llama_sampler_chain_reset(struct llama_sampler * smpl) {
for (auto * smpl : chain->samplers) {
llama_sampler_reset(smpl);
}

chain->t_sample_us = 0;
chain->n_sample = 0;
}

static struct llama_sampler * llama_sampler_chain_clone(const struct llama_sampler * smpl) {
@@ -2670,8 +2667,7 @@ struct llama_perf_sampler_data llama_perf_sampler(const struct llama_sampler * c
void llama_perf_sampler_print(const struct llama_sampler * chain) {
const auto data = llama_perf_sampler(chain);

LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
__func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
LLAMA_LOG_INFO("%s: samplers time = %10.2f ms / %5d runs\n", __func__, data.t_sample_ms, data.n_sample);
}

void llama_perf_sampler_reset(struct llama_sampler * chain) {
@@ -2681,5 +2677,6 @@ void llama_perf_sampler_reset(struct llama_sampler * chain) {

auto * ctx = (struct llama_sampler_chain *) chain->ctx;

ctx->t_sample_us = ctx->n_sample = 0;
ctx->t_sample_us = 0;
ctx->n_sample = 0;
}
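A consequence of dropping the counter reset from llama_sampler_chain_reset: resetting a chain no longer zeroes t_sample_us / n_sample, presumably so that a sampler reset between generations does not wipe the accumulated perf data. When a fresh measurement is wanted, the counters are now cleared explicitly, roughly as follows (illustrative snippet, assuming a struct llama_sampler * chain built as a sampler chain):

llama_sampler_reset(chain);       // resets sampler state; perf counters are preserved
llama_perf_sampler_reset(chain);  // zero t_sample_us / n_sample before a new measurement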
8 changes: 6 additions & 2 deletions tools/main/main.cpp
@@ -147,11 +147,15 @@ int main(int argc, char ** argv) {
return 1;
}

auto * mem = llama_get_memory(ctx);

llama_memory_t mem = llama_get_memory(ctx);
const llama_vocab * vocab = llama_model_get_vocab(model);

// note: the time for chat template initialization is not negligible:
auto chat_templates = common_chat_templates_init(model, params.chat_template);

// start measuring performance timings from here
llama_perf_context_reset(ctx);

Comment on lines +152 to +158

Member (Author): Didn't realize until now that this chat template initialization call can take a significant amount of time (tens of milliseconds). Accounting for this, the reported timings for sampling, prompt eval, eval and total now add up nicely.
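To see that cost directly, the initialization could be timed in place; a sketch that assumes the surrounding main.cpp scope (model, params, ctx) and only the functions already used in this diff:

const int64_t t_tmpl_us = ggml_time_us();
auto chat_templates = common_chat_templates_init(model, params.chat_template);
LOG_INF("%s: chat template init: %.2f ms\n", __func__, 1e-3 * (ggml_time_us() - t_tmpl_us));

// perf counters start only after this point, so the template init cost stays out of "total time"
llama_perf_context_reset(ctx);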

LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads);

auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);