src/llama-context.cpp: 66 additions & 0 deletions
@@ -12,10 +12,18 @@
#include <limits>
#include <stdexcept>

#include "thermal_control.h"

//
// llama_context
//

std::atomic<int> g_token_count{0};
std::atomic<bool> g_monitoring{false};
std::thread g_monitor_thread;
std::ofstream g_csv;


llama_context::llama_context(
const llama_model & model,
llama_context_params params) :
@@ -958,6 +966,8 @@ int llama_context::encode(const llama_batch & batch_inp) {
int llama_context::decode(const llama_batch & batch_inp) {
GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT

thermal_control_check();

if (!memory) {
LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__);
return encode(batch_inp);
@@ -968,6 +978,62 @@ int llama_context::decode(const llama_batch & batch_inp) {
return -1;
}

static bool is_first = true;
if (is_first) {
is_first = false;

// open the CSV output file and write the column header
g_csv.open("throughput.csv");
g_csv << "timestamp,elapsed_sec,tokens_per_sec,total_tokens\n";

// start the monitoring thread
g_monitoring = true;
g_token_count = 0;

auto start = std::chrono::steady_clock::now();

g_monitor_thread = std::thread([start]() {
int last_count = 0;

while (g_monitoring) {
std::this_thread::sleep_for(std::chrono::seconds(1));
if (!g_monitoring) break;

auto now = std::chrono::steady_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
now - start).count() / 1000.0;

int current = g_token_count.load();
int per_sec = current - last_count;

auto ts = std::chrono::system_clock::now().time_since_epoch();
auto ts_ms = std::chrono::duration_cast<std::chrono::milliseconds>(ts).count();

g_csv << ts_ms << "," << elapsed << "," << per_sec << "," << current << "\n";
g_csv.flush();

fprintf(stderr, "[%.1fs] %d tok/s\n", elapsed, per_sec);

last_count = current;
}

g_csv.close();
});

// clean up automatically when the program exits
std::atexit([]() {
g_monitoring = false;
if (g_monitor_thread.joinable()) {
g_monitor_thread.join();
}
});

LLAMA_LOG_INFO("Throughput monitoring started\n");
}

// count the tokens submitted for decoding
g_token_count.fetch_add(batch_inp.n_tokens);

const auto & vocab = model.vocab;
const auto & hparams = model.hparams;

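Note: the hunk above includes "thermal_control.h" and calls thermal_control_check() at the top of decode(), but that header is not shown in this diff. A hypothetical minimal stub, assuming a Linux sysfs thermal zone and a simple sleep-based back-off (the path, the 85 C threshold, and the 100 ms pause are illustrative choices, not taken from the PR):

// thermal_control.h -- hypothetical sketch only; the PR's actual header is not shown here.
#pragma once

#include <chrono>
#include <fstream>
#include <thread>

inline void thermal_control_check() {
    // Linux exposes the zone temperature in millidegrees Celsius.
    std::ifstream f("/sys/class/thermal/thermal_zone0/temp");
    long milli_c = 0;
    if (!(f >> milli_c)) {
        return; // sensor not available: do not throttle
    }
    // Above ~85 C, pause briefly before the next decode step to let the SoC cool down.
    if (milli_c > 85000) {
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
    }
}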
src/llama-impl.h: 11 additions & 0 deletions
@@ -61,3 +61,14 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t);
std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i);

#define LLAMA_TENSOR_NAME_FATTN "__fattn__"


#include <atomic>
#include <thread>
#include <fstream>

// Throughput monitoring
extern std::atomic<int> g_token_count;
extern std::atomic<bool> g_monitoring;
extern std::thread g_monitor_thread;
extern std::ofstream g_csv;
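For reference, a small standalone helper, not part of the PR, that averages the tokens_per_sec column of the throughput.csv written by the monitor thread above; the column layout matches the header the diff writes:

// summarize_throughput.cpp -- hypothetical helper, not part of the PR.
#include <cstdio>
#include <fstream>
#include <sstream>
#include <string>

int main() {
    std::ifstream csv("throughput.csv");
    std::string line;
    std::getline(csv, line); // skip header: timestamp,elapsed_sec,tokens_per_sec,total_tokens

    long   samples = 0;
    double sum_tps = 0.0;

    while (std::getline(csv, line)) {
        std::stringstream ss(line);
        std::string ts, elapsed, tps;
        if (std::getline(ss, ts, ',') && std::getline(ss, elapsed, ',') && std::getline(ss, tps, ',')) {
            sum_tps += std::stod(tps);
            ++samples;
        }
    }

    if (samples > 0) {
        printf("samples: %ld, average throughput: %.1f tok/s\n", samples, sum_tps / samples);
    }
    return 0;
}

Because the monitor calls g_csv.flush() after every sample, the file can be summarized like this while a run is still in progress.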