From b867c8fc1fd21ef5056a8db247a6e9da1c797cc2 Mon Sep 17 00:00:00 2001 From: Aiden Date: Mon, 27 Oct 2025 16:09:55 -0400 Subject: [PATCH 01/14] temperature throttler added --- src/llama-context.cpp | 4 ++ src/thermal_control.h | 111 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 src/thermal_control.h diff --git a/src/llama-context.cpp b/src/llama-context.cpp index d8a8b5e647a85..e07b7e5273830 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -12,6 +12,8 @@ #include #include +#include "thermal_control.h" + // // llama_context // @@ -958,6 +960,8 @@ int llama_context::encode(const llama_batch & batch_inp) { int llama_context::decode(const llama_batch & batch_inp) { GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT + thermal_control_check(); + if (!memory) { LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); return encode(batch_inp); diff --git a/src/thermal_control.h b/src/thermal_control.h new file mode 100644 index 0000000000000..9bf7ef4fc2729 --- /dev/null +++ b/src/thermal_control.h @@ -0,0 +1,111 @@ +// thermal_control.h +#ifndef THERMAL_CONTROL_H +#define THERMAL_CONTROL_H + +#include // fprintf, stderr, snprintf 추가 +#include +#include +#include +#include +#include +#include // bool 타입 추가 + +#define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" +#define TEMP_THRESHOLD_MC 70000 // 70도 = 70000 millidegree C +#define TARGET_CPU_FREQ 1785600 + +// GPU 온도 읽기 (millidegree C) +static inline int read_gpu_temp() { + int fd = open(GPU_TEMP_PATH, O_RDONLY); + if (fd < 0) return -1; + + char buf[32]; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + + if (n <= 0) return -1; + buf[n] = '\0'; + + // Fast parse + int temp = 0; + for (int i = 0; i < n && buf[i] >= '0' && buf[i] <= '9'; i++) { + temp = temp * 10 + (buf[i] - '0'); + } + return temp; +} + +// CPU frequency 설정 +static inline bool set_cpu_freq(int freq_khz) { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (!dir) return false; + + bool success = true; + struct dirent *entry; + + while ((entry = readdir(dir)) != NULL) { + // cpu0, cpu1, ... 찾기 + if (strncmp(entry->d_name, "cpu", 3) != 0) continue; + if (entry->d_name[3] < '0' || entry->d_name[3] > '9') continue; + + char min_path[256], max_path[256]; + snprintf(min_path, sizeof(min_path), + "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", + entry->d_name); + snprintf(max_path, sizeof(max_path), + "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", + entry->d_name); + + // scaling_max_freq 먼저 설정 (min보다 높아야 함) + int fd_max = open(max_path, O_WRONLY); + if (fd_max >= 0) { + char freq_str[16]; + int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); + write(fd_max, freq_str, len); + close(fd_max); + } else { + success = false; + } + + // scaling_min_freq 설정 + int fd_min = open(min_path, O_WRONLY); + if (fd_min >= 0) { + char freq_str[16]; + int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); + write(fd_min, freq_str, len); + close(fd_min); + } else { + success = false; + } + } + + closedir(dir); + return success; +} + +// 온도 기반 thermal control +static inline void thermal_control_check() { + static int last_temp = -1; + static bool throttled = false; + + int temp_mc = read_gpu_temp(); + if (temp_mc < 0) return; // 읽기 실패시 무시 + + // 70도 이상이면 throttle + if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { + set_cpu_freq(TARGET_CPU_FREQ); + throttled = true; + fprintf(stderr, "Thermal: GPU temp %.1f°C >= 70°C, throttling CPU to %d KHz\n", + temp_mc / 1000.0, TARGET_CPU_FREQ); + } + // 65도 이하로 내려가면 throttle 해제 (hysteresis) + else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { + // 원하는 경우 여기서 원래 frequency로 복구 가능 + throttled = false; + fprintf(stderr, "Thermal: GPU temp %.1f°C < 65°C, releasing throttle\n", + temp_mc / 1000.0); + } + + last_temp = temp_mc; +} + +#endif // THERMAL_CONTROL_H \ No newline at end of file From 5b5b8b99baf27c85c2a2e804513426d86ce3cc4c Mon Sep 17 00:00:00 2001 From: Aiden Date: Mon, 27 Oct 2025 18:05:53 -0400 Subject: [PATCH 02/14] temperature fixed --- src/thermal_control.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 9bf7ef4fc2729..059e2f70f8df4 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -11,8 +11,8 @@ #include // bool 타입 추가 #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" -#define TEMP_THRESHOLD_MC 70000 // 70도 = 70000 millidegree C -#define TARGET_CPU_FREQ 1785600 +#define TEMP_THRESHOLD_MC 60000 // 60도 = 60000 millidegree C +#define TARGET_CPU_FREQ 960000 // GPU 온도 읽기 (millidegree C) static inline int read_gpu_temp() { @@ -94,14 +94,14 @@ static inline void thermal_control_check() { if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { set_cpu_freq(TARGET_CPU_FREQ); throttled = true; - fprintf(stderr, "Thermal: GPU temp %.1f°C >= 70°C, throttling CPU to %d KHz\n", + fprintf(stderr, "Thermal: GPU temp %.1f°C >= 60°C, throttling CPU to %d KHz\n", temp_mc / 1000.0, TARGET_CPU_FREQ); } // 65도 이하로 내려가면 throttle 해제 (hysteresis) else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { // 원하는 경우 여기서 원래 frequency로 복구 가능 throttled = false; - fprintf(stderr, "Thermal: GPU temp %.1f°C < 65°C, releasing throttle\n", + fprintf(stderr, "Thermal: GPU temp %.1f°C < 55°C, releasing throttle\n", temp_mc / 1000.0); } From 64d603fa843c189cb493f76d1c206f4b84a5409a Mon Sep 17 00:00:00 2001 From: Aiden Date: Mon, 27 Oct 2025 19:16:55 -0400 Subject: [PATCH 03/14] throughput monitoring added --- src/llama-context.cpp | 64 +++++++++++++++++- src/llama-impl.h | 11 +++ src/thermal_control.h | 154 +++++++++++++++++++++++++++++------------- 3 files changed, 181 insertions(+), 48 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e07b7e5273830..c66d4f4a55673 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -18,6 +18,12 @@ // llama_context // +std::atomic g_token_count{0}; +std::atomic g_monitoring{false}; +std::thread g_monitor_thread; +std::ofstream g_csv; + + llama_context::llama_context( const llama_model & model, llama_context_params params) : @@ -960,7 +966,7 @@ int llama_context::encode(const llama_batch & batch_inp) { int llama_context::decode(const llama_batch & batch_inp) { GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT - thermal_control_check(); + // thermal_control_check(); if (!memory) { LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); @@ -972,6 +978,62 @@ int llama_context::decode(const llama_batch & batch_inp) { return -1; } + static bool is_first = true; + if (is_first) { + is_first = false; + + // CSV 파일 열기 + g_csv.open("throughput.csv"); + g_csv << "timestamp,elapsed_sec,tokens_per_sec,total_tokens\n"; + + // 모니터링 스레드 시작 + g_monitoring = true; + g_token_count = 0; + + auto start = std::chrono::steady_clock::now(); + + g_monitor_thread = std::thread([start]() { + int last_count = 0; + + while (g_monitoring) { + std::this_thread::sleep_for(std::chrono::seconds(1)); + if (!g_monitoring) break; + + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast( + now - start).count() / 1000.0; + + int current = g_token_count.load(); + int per_sec = current - last_count; + + auto ts = std::chrono::system_clock::now().time_since_epoch(); + auto ts_ms = std::chrono::duration_cast(ts).count(); + + g_csv << ts_ms << "," << elapsed << "," << per_sec << "," << current << "\n"; + g_csv.flush(); + + fprintf(stderr, "[%.1fs] %d tok/s\n", elapsed, per_sec); + + last_count = current; + } + + g_csv.close(); + }); + + // 프로그램 종료 시 자동으로 정리 + std::atexit([]() { + g_monitoring = false; + if (g_monitor_thread.joinable()) { + g_monitor_thread.join(); + } + }); + + LLAMA_LOG_INFO("Throughput monitoring started\n"); + } + + // 토큰 카운트 증가 + g_token_count.fetch_add(batch_inp.n_tokens); + const auto & vocab = model.vocab; const auto & hparams = model.hparams; diff --git a/src/llama-impl.h b/src/llama-impl.h index c5163e9225a5e..3cc2da88547b5 100644 --- a/src/llama-impl.h +++ b/src/llama-impl.h @@ -61,3 +61,14 @@ std::string llama_format_tensor_shape(const struct ggml_tensor * t); std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i); #define LLAMA_TENSOR_NAME_FATTN "__fattn__" + + +#include +#include +#include + +// Throughput monitoring +extern std::atomic g_token_count; +extern std::atomic g_monitoring; +extern std::thread g_monitor_thread; +extern std::ofstream g_csv; \ No newline at end of file diff --git a/src/thermal_control.h b/src/thermal_control.h index 059e2f70f8df4..2e5dae7c0677a 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -2,27 +2,32 @@ #ifndef THERMAL_CONTROL_H #define THERMAL_CONTROL_H -#include // fprintf, stderr, snprintf 추가 +#include #include #include #include #include #include -#include // bool 타입 추가 +#include +#include // 🔥 추가 +#include // 🔥 추가 #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" #define TEMP_THRESHOLD_MC 60000 // 60도 = 60000 millidegree C #define TARGET_CPU_FREQ 960000 +#define CHECK_INTERVAL 10 // 10 토큰마다 한 번 체크 -// GPU 온도 읽기 (millidegree C) -static inline int read_gpu_temp() { - int fd = open(GPU_TEMP_PATH, O_RDONLY); +// 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) +extern std::ofstream g_csv; + +// GPU 온도 읽기 (millidegree C) - FD 재사용 +static inline int read_gpu_temp_fast(int fd) { if (fd < 0) return -1; + if (lseek(fd, 0, SEEK_SET) < 0) return -1; + char buf[32]; ssize_t n = read(fd, buf, sizeof(buf) - 1); - close(fd); - if (n <= 0) return -1; buf[n] = '\0'; @@ -34,16 +39,45 @@ static inline int read_gpu_temp() { return temp; } -// CPU frequency 설정 -static inline bool set_cpu_freq(int freq_khz) { - DIR *dir = opendir("/sys/devices/system/cpu"); - if (!dir) return false; +// CPU frequency 설정 - 캐시된 FD 사용 +static inline bool set_cpu_freq_cached(int *fd_cache, int num_fds, int freq_khz) { + if (num_fds == 0) return false; + + char freq_str[16]; + int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); bool success = true; + for (int i = 0; i < num_fds; i += 2) { + // max 먼저 + if (fd_cache[i + 1] >= 0) { + if (lseek(fd_cache[i + 1], 0, SEEK_SET) >= 0) { + write(fd_cache[i + 1], freq_str, len); + } else { + success = false; + } + } + // min 나중에 + if (fd_cache[i] >= 0) { + if (lseek(fd_cache[i], 0, SEEK_SET) >= 0) { + write(fd_cache[i], freq_str, len); + } else { + success = false; + } + } + } + + return success; +} + +// 초기화: CPU FD들을 미리 열어두기 +static inline int init_cpu_freq_fds(int *fd_cache, int max_fds) { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (!dir) return 0; + + int count = 0; struct dirent *entry; - while ((entry = readdir(dir)) != NULL) { - // cpu0, cpu1, ... 찾기 + while ((entry = readdir(dir)) != NULL && count < max_fds - 1) { if (strncmp(entry->d_name, "cpu", 3) != 0) continue; if (entry->d_name[3] < '0' || entry->d_name[3] > '9') continue; @@ -55,57 +89,83 @@ static inline bool set_cpu_freq(int freq_khz) { "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", entry->d_name); - // scaling_max_freq 먼저 설정 (min보다 높아야 함) - int fd_max = open(max_path, O_WRONLY); - if (fd_max >= 0) { - char freq_str[16]; - int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); - write(fd_max, freq_str, len); - close(fd_max); - } else { - success = false; - } - - // scaling_min_freq 설정 - int fd_min = open(min_path, O_WRONLY); - if (fd_min >= 0) { - char freq_str[16]; - int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); - write(fd_min, freq_str, len); - close(fd_min); - } else { - success = false; - } + fd_cache[count++] = open(min_path, O_WRONLY); + fd_cache[count++] = open(max_path, O_WRONLY); } closedir(dir); - return success; + return count; +} + +// 🔥 CSV에 thermal 이벤트 기록 +static inline void log_thermal_event(const char* event, double temp_celsius, int freq_khz = 0) { + if (!g_csv.is_open()) return; + + auto ts = std::chrono::system_clock::now().time_since_epoch(); + auto ts_ms = std::chrono::duration_cast(ts).count(); + + // CSV 형식: timestamp,-1,event_type,temp,freq + g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_khz << "\n"; + g_csv.flush(); } -// 온도 기반 thermal control +// 온도 기반 thermal control - 최적화 버전 static inline void thermal_control_check() { - static int last_temp = -1; + static int gpu_temp_fd = -1; + static int cpu_freq_fds[32]; // max 16 CPUs = 32 FDs (min/max pairs) + static int num_cpu_fds = 0; + static bool initialized = false; + static int call_count = 0; static bool throttled = false; - int temp_mc = read_gpu_temp(); - if (temp_mc < 0) return; // 읽기 실패시 무시 + // 초기화 + if (!initialized) { + gpu_temp_fd = open(GPU_TEMP_PATH, O_RDONLY); + num_cpu_fds = init_cpu_freq_fds(cpu_freq_fds, 32); + initialized = true; + if (gpu_temp_fd < 0 || num_cpu_fds == 0) { + fprintf(stderr, "Thermal: Failed to initialize (fd=%d, num_fds=%d)\n", + gpu_temp_fd, num_cpu_fds); + } + } + + // CHECK_INTERVAL 토큰마다만 체크 + call_count++; + if (call_count % CHECK_INTERVAL != 0) { + return; + } - // 70도 이상이면 throttle + int temp_mc = read_gpu_temp_fast(gpu_temp_fd); + if (temp_mc < 0) return; + + double temp_celsius = temp_mc / 1000.0; // 🔥 추가 + + // 60도 이상이면 throttle if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { - set_cpu_freq(TARGET_CPU_FREQ); + set_cpu_freq_cached(cpu_freq_fds, num_cpu_fds, TARGET_CPU_FREQ); throttled = true; + + // 🔥 콘솔 출력 fprintf(stderr, "Thermal: GPU temp %.1f°C >= 60°C, throttling CPU to %d KHz\n", - temp_mc / 1000.0, TARGET_CPU_FREQ); + temp_celsius, TARGET_CPU_FREQ); + + // 🔥 CSV에 기록 + log_thermal_event("THROTTLE", temp_celsius, TARGET_CPU_FREQ); } - // 65도 이하로 내려가면 throttle 해제 (hysteresis) + // 55도 이하로 내려가면 throttle 해제 else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { - // 원하는 경우 여기서 원래 frequency로 복구 가능 throttled = false; + + // 🔥 콘솔 출력 fprintf(stderr, "Thermal: GPU temp %.1f°C < 55°C, releasing throttle\n", - temp_mc / 1000.0); + temp_celsius); + + // 🔥 CSV에 기록 + log_thermal_event("RELEASE", temp_celsius, 0); + + // 필요시 원래 frequency로 복구 + // set_cpu_freq_cached(cpu_freq_fds, num_cpu_fds, ORIGINAL_FREQ); } - - last_temp = temp_mc; } #endif // THERMAL_CONTROL_H \ No newline at end of file From e2158ae2a41756e23e09e8d18841d65193bc3520 Mon Sep 17 00:00:00 2001 From: Aiden Date: Mon, 27 Oct 2025 20:01:06 -0400 Subject: [PATCH 04/14] thermal check --- src/llama-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c66d4f4a55673..b1b76eb5ef76e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -966,7 +966,7 @@ int llama_context::encode(const llama_batch & batch_inp) { int llama_context::decode(const llama_batch & batch_inp) { GGML_ASSERT((!batch_inp.token && batch_inp.embd) || (batch_inp.token && !batch_inp.embd)); // NOLINT - // thermal_control_check(); + thermal_control_check(); if (!memory) { LLAMA_LOG_DEBUG("%s: cannot decode batches with this context (calling encode() instead)\n", __func__); From a79a312c1b6c444e4d409cf035ce0ab17c7cac13 Mon Sep 17 00:00:00 2001 From: Aiden Date: Mon, 27 Oct 2025 23:11:42 -0400 Subject: [PATCH 05/14] fixing thermal control bug - not being able to change freq --- src/thermal_control.h | 152 +++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 99 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 2e5dae7c0677a..39fa4ed6d49cf 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -7,27 +7,23 @@ #include #include #include -#include #include -#include // 🔥 추가 -#include // 🔥 추가 +#include #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" #define TEMP_THRESHOLD_MC 60000 // 60도 = 60000 millidegree C #define TARGET_CPU_FREQ 960000 #define CHECK_INTERVAL 10 // 10 토큰마다 한 번 체크 -// 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) -extern std::ofstream g_csv; - -// GPU 온도 읽기 (millidegree C) - FD 재사용 -static inline int read_gpu_temp_fast(int fd) { +// GPU 온도 읽기 (millidegree C) +static inline int read_gpu_temp() { + int fd = open(GPU_TEMP_PATH, O_RDONLY); if (fd < 0) return -1; - if (lseek(fd, 0, SEEK_SET) < 0) return -1; - char buf[32]; ssize_t n = read(fd, buf, sizeof(buf) - 1); + close(fd); + if (n <= 0) return -1; buf[n] = '\0'; @@ -39,132 +35,90 @@ static inline int read_gpu_temp_fast(int fd) { return temp; } -// CPU frequency 설정 - 캐시된 FD 사용 -static inline bool set_cpu_freq_cached(int *fd_cache, int num_fds, int freq_khz) { - if (num_fds == 0) return false; - - char freq_str[16]; - int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); - - bool success = true; - for (int i = 0; i < num_fds; i += 2) { - // max 먼저 - if (fd_cache[i + 1] >= 0) { - if (lseek(fd_cache[i + 1], 0, SEEK_SET) >= 0) { - write(fd_cache[i + 1], freq_str, len); - } else { - success = false; - } - } - // min 나중에 - if (fd_cache[i] >= 0) { - if (lseek(fd_cache[i], 0, SEEK_SET) >= 0) { - write(fd_cache[i], freq_str, len); - } else { - success = false; - } - } +// CPU frequency 설정 - echo처럼 +static inline bool set_cpu_freq(int freq_khz) { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (!dir) { + fprintf(stderr, "Thermal: Cannot open /sys/devices/system/cpu: %s\n", strerror(errno)); + return false; } - return success; -} - -// 초기화: CPU FD들을 미리 열어두기 -static inline int init_cpu_freq_fds(int *fd_cache, int max_fds) { - DIR *dir = opendir("/sys/devices/system/cpu"); - if (!dir) return 0; + char freq_str[32]; + snprintf(freq_str, sizeof(freq_str), "%d\n", freq_khz); - int count = 0; + bool success = false; struct dirent *entry; - while ((entry = readdir(dir)) != NULL && count < max_fds - 1) { + while ((entry = readdir(dir)) != NULL) { + // cpu0, cpu1, ... 찾기 if (strncmp(entry->d_name, "cpu", 3) != 0) continue; - if (entry->d_name[3] < '0' || entry->d_name[3] > '9') continue; + char c = entry->d_name[3]; + if (c < '0' || c > '9') continue; - char min_path[256], max_path[256]; - snprintf(min_path, sizeof(min_path), - "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", - entry->d_name); - snprintf(max_path, sizeof(max_path), + char path[256]; + + // scaling_max_freq 먼저 (min보다 크거나 같아야 함) + snprintf(path, sizeof(path), "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", entry->d_name); + int fd = open(path, O_WRONLY | O_TRUNC); + if (fd >= 0) { + write(fd, freq_str, strlen(freq_str)); + close(fd); + success = true; + } - fd_cache[count++] = open(min_path, O_WRONLY); - fd_cache[count++] = open(max_path, O_WRONLY); + // scaling_min_freq + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", + entry->d_name); + fd = open(path, O_WRONLY | O_TRUNC); + if (fd >= 0) { + write(fd, freq_str, strlen(freq_str)); + close(fd); + } } closedir(dir); - return count; -} - -// 🔥 CSV에 thermal 이벤트 기록 -static inline void log_thermal_event(const char* event, double temp_celsius, int freq_khz = 0) { - if (!g_csv.is_open()) return; - - auto ts = std::chrono::system_clock::now().time_since_epoch(); - auto ts_ms = std::chrono::duration_cast(ts).count(); - - // CSV 형식: timestamp,-1,event_type,temp,freq - g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_khz << "\n"; - g_csv.flush(); + return success; } -// 온도 기반 thermal control - 최적화 버전 +// 온도 기반 thermal control static inline void thermal_control_check() { - static int gpu_temp_fd = -1; - static int cpu_freq_fds[32]; // max 16 CPUs = 32 FDs (min/max pairs) - static int num_cpu_fds = 0; static bool initialized = false; static int call_count = 0; static bool throttled = false; - // 초기화 if (!initialized) { - gpu_temp_fd = open(GPU_TEMP_PATH, O_RDONLY); - num_cpu_fds = init_cpu_freq_fds(cpu_freq_fds, 32); initialized = true; - if (gpu_temp_fd < 0 || num_cpu_fds == 0) { - fprintf(stderr, "Thermal: Failed to initialize (fd=%d, num_fds=%d)\n", - gpu_temp_fd, num_cpu_fds); - } + fprintf(stderr, "Thermal: Control initialized\n"); } - // CHECK_INTERVAL 토큰마다만 체크 call_count++; if (call_count % CHECK_INTERVAL != 0) { return; } - int temp_mc = read_gpu_temp_fast(gpu_temp_fd); + int temp_mc = read_gpu_temp(); if (temp_mc < 0) return; - double temp_celsius = temp_mc / 1000.0; // 🔥 추가 - // 60도 이상이면 throttle if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { - set_cpu_freq_cached(cpu_freq_fds, num_cpu_fds, TARGET_CPU_FREQ); - throttled = true; + fprintf(stderr, "Thermal: GPU %.1f°C >= 60°C, throttling CPU to %d KHz\n", + temp_mc / 1000.0, TARGET_CPU_FREQ); - // 🔥 콘솔 출력 - fprintf(stderr, "Thermal: GPU temp %.1f°C >= 60°C, throttling CPU to %d KHz\n", - temp_celsius, TARGET_CPU_FREQ); - - // 🔥 CSV에 기록 - log_thermal_event("THROTTLE", temp_celsius, TARGET_CPU_FREQ); + if (set_cpu_freq(TARGET_CPU_FREQ)) { + throttled = true; + fprintf(stderr, "Thermal: Throttling activated\n"); + } else { + fprintf(stderr, "Thermal: Failed (try sudo)\n"); + } } - // 55도 이하로 내려가면 throttle 해제 + // 55도 이하면 해제 else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { throttled = false; - - // 🔥 콘솔 출력 - fprintf(stderr, "Thermal: GPU temp %.1f°C < 55°C, releasing throttle\n", - temp_celsius); - - // 🔥 CSV에 기록 - log_thermal_event("RELEASE", temp_celsius, 0); - - // 필요시 원래 frequency로 복구 - // set_cpu_freq_cached(cpu_freq_fds, num_cpu_fds, ORIGINAL_FREQ); + fprintf(stderr, "Thermal: GPU %.1f°C < 55°C, releasing throttle\n", + temp_mc / 1000.0); } } From 93517f0df1470024d83be9b8cfdd05929c3f6183 Mon Sep 17 00:00:00 2001 From: Aiden Date: Tue, 28 Oct 2025 00:05:50 -0400 Subject: [PATCH 06/14] fixed --- src/thermal_control.h | 128 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 12 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 39fa4ed6d49cf..09bf3e896ce46 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -7,7 +7,10 @@ #include #include #include +#include #include +#include +#include #include #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" @@ -15,6 +18,9 @@ #define TARGET_CPU_FREQ 960000 #define CHECK_INTERVAL 10 // 10 토큰마다 한 번 체크 +// 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) +extern std::ofstream g_csv; + // GPU 온도 읽기 (millidegree C) static inline int read_gpu_temp() { int fd = open(GPU_TEMP_PATH, O_RDONLY); @@ -83,17 +89,108 @@ static inline bool set_cpu_freq(int freq_khz) { return success; } -// 온도 기반 thermal control +// GPU 온도 읽기 (millidegree C) - FD 재사용 +static inline int read_gpu_temp_fast(int fd) { + if (fd < 0) return -1; + + if (lseek(fd, 0, SEEK_SET) < 0) return -1; + + char buf[32]; + ssize_t n = read(fd, buf, sizeof(buf) - 1); + if (n <= 0) return -1; + buf[n] = '\0'; + + // Fast parse + int temp = 0; + for (int i = 0; i < n && buf[i] >= '0' && buf[i] <= '9'; i++) { + temp = temp * 10 + (buf[i] - '0'); + } + return temp; +} + +// CPU frequency 설정 - 캐시된 FD 사용 +static inline bool set_cpu_freq_cached(int *fd_cache, int num_fds, int freq_khz) { + if (num_fds == 0) return false; + + char freq_str[16]; + int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); + + bool success = true; + for (int i = 0; i < num_fds; i += 2) { + // max 먼저 + if (fd_cache[i + 1] >= 0) { + if (lseek(fd_cache[i + 1], 0, SEEK_SET) >= 0) { + write(fd_cache[i + 1], freq_str, len); + } else { + success = false; + } + } + // min 나중에 + if (fd_cache[i] >= 0) { + if (lseek(fd_cache[i], 0, SEEK_SET) >= 0) { + write(fd_cache[i], freq_str, len); + } else { + success = false; + } + } + } + + return success; +} + +// 초기화: CPU FD들을 미리 열어두기 +static inline int init_cpu_freq_fds(int *fd_cache, int max_fds) { + DIR *dir = opendir("/sys/devices/system/cpu"); + if (!dir) return 0; + + int count = 0; + struct dirent *entry; + + while ((entry = readdir(dir)) != NULL && count < max_fds - 1) { + if (strncmp(entry->d_name, "cpu", 3) != 0) continue; + if (entry->d_name[3] < '0' || entry->d_name[3] > '9') continue; + + char min_path[256], max_path[256]; + snprintf(min_path, sizeof(min_path), + "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", + entry->d_name); + snprintf(max_path, sizeof(max_path), + "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", + entry->d_name); + + fd_cache[count++] = open(min_path, O_WRONLY); + fd_cache[count++] = open(max_path, O_WRONLY); + } + + closedir(dir); + return count; +} + +// 🔥 CSV에 thermal 이벤트 기록 +static inline void log_thermal_event(const char* event, double temp_celsius, int freq_khz = 0) { + if (!g_csv.is_open()) return; + + auto ts = std::chrono::system_clock::now().time_since_epoch(); + auto ts_ms = std::chrono::duration_cast(ts).count(); + + // CSV 형식: timestamp,-1,event_type,temp,freq + g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_khz << "\n"; + g_csv.flush(); +} + +// 온도 기반 thermal control - read_gpu_temp()와 set_cpu_freq() 사용 static inline void thermal_control_check() { static bool initialized = false; static int call_count = 0; static bool throttled = false; + // 초기화 if (!initialized) { initialized = true; fprintf(stderr, "Thermal: Control initialized\n"); } + // CHECK_INTERVAL 토큰마다만 체크 call_count++; if (call_count % CHECK_INTERVAL != 0) { return; @@ -102,23 +199,30 @@ static inline void thermal_control_check() { int temp_mc = read_gpu_temp(); if (temp_mc < 0) return; + double temp_celsius = temp_mc / 1000.0; + // 60도 이상이면 throttle if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { - fprintf(stderr, "Thermal: GPU %.1f°C >= 60°C, throttling CPU to %d KHz\n", - temp_mc / 1000.0, TARGET_CPU_FREQ); + set_cpu_freq(TARGET_CPU_FREQ); + throttled = true; - if (set_cpu_freq(TARGET_CPU_FREQ)) { - throttled = true; - fprintf(stderr, "Thermal: Throttling activated\n"); - } else { - fprintf(stderr, "Thermal: Failed (try sudo)\n"); - } + // 🔥 콘솔 출력 + fprintf(stderr, "Thermal: GPU temp %.1f°C >= 60°C, throttling CPU to %d KHz\n", + temp_celsius, TARGET_CPU_FREQ); + + // 🔥 CSV에 기록 + log_thermal_event("THROTTLE", temp_celsius, TARGET_CPU_FREQ); } - // 55도 이하면 해제 + // 55도 이하로 내려가면 throttle 해제 else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { throttled = false; - fprintf(stderr, "Thermal: GPU %.1f°C < 55°C, releasing throttle\n", - temp_mc / 1000.0); + + // 🔥 콘솔 출력 + fprintf(stderr, "Thermal: GPU temp %.1f°C < 55°C, releasing throttle\n", + temp_celsius); + + // 🔥 CSV에 기록 + log_thermal_event("RELEASE", temp_celsius, 0); } } From ae845c64ca44654e2951b627ce3e953e84dd58ad Mon Sep 17 00:00:00 2001 From: Aiden Date: Tue, 28 Oct 2025 01:27:25 -0400 Subject: [PATCH 07/14] using gpu frequency - works better --- src/thermal_control.h | 115 +++++++++++++++++++++++++++++------------- 1 file changed, 81 insertions(+), 34 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 09bf3e896ce46..33cb0dd968cd5 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -10,13 +10,20 @@ #include #include #include -#include +#include #include +#include #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" -#define TEMP_THRESHOLD_MC 60000 // 60도 = 60000 millidegree C -#define TARGET_CPU_FREQ 960000 -#define CHECK_INTERVAL 10 // 10 토큰마다 한 번 체크 +#define GPU_MIN_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/min_freq" +#define GPU_MAX_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/max_freq" +#define CHECK_INTERVAL 2 // 10 토큰마다 한 번 체크 + +// 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) +static std::map temp_to_freq = { + {60, 443000000}, + {70, 660000000} +}; // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) extern std::ofstream g_csv; @@ -41,7 +48,37 @@ static inline int read_gpu_temp() { return temp; } -// CPU frequency 설정 - echo처럼 +// GPU frequency 설정 - echo처럼 +static inline bool set_gpu_freq(int freq_hz) { + char freq_str[32]; + snprintf(freq_str, sizeof(freq_str), "%d\n", freq_hz); + + bool success = false; + + // min_freq 설정 + int fd_min = open(GPU_MIN_FREQ_PATH, O_WRONLY | O_TRUNC); + if (fd_min >= 0) { + write(fd_min, freq_str, strlen(freq_str)); + close(fd_min); + success = true; + } else { + fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); + } + + // max_freq 설정 + int fd_max = open(GPU_MAX_FREQ_PATH, O_WRONLY | O_TRUNC); + if (fd_max >= 0) { + write(fd_max, freq_str, strlen(freq_str)); + close(fd_max); + } else { + fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MAX_FREQ_PATH, strerror(errno)); + } + + return success; +} + +// CPU frequency 설정 - echo처럼 (주석 처리) +/* static inline bool set_cpu_freq(int freq_khz) { DIR *dir = opendir("/sys/devices/system/cpu"); if (!dir) { @@ -88,6 +125,7 @@ static inline bool set_cpu_freq(int freq_khz) { closedir(dir); return success; } +*/ // GPU 온도 읽기 (millidegree C) - FD 재사용 static inline int read_gpu_temp_fast(int fd) { @@ -167,22 +205,35 @@ static inline int init_cpu_freq_fds(int *fd_cache, int max_fds) { } // 🔥 CSV에 thermal 이벤트 기록 -static inline void log_thermal_event(const char* event, double temp_celsius, int freq_khz = 0) { +static inline void log_thermal_event(const char* event, double temp_celsius, int freq_hz = 0) { if (!g_csv.is_open()) return; auto ts = std::chrono::system_clock::now().time_since_epoch(); auto ts_ms = std::chrono::duration_cast(ts).count(); // CSV 형식: timestamp,-1,event_type,temp,freq - g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_khz << "\n"; + g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_hz << "\n"; g_csv.flush(); } -// 온도 기반 thermal control - read_gpu_temp()와 set_cpu_freq() 사용 +// 온도에 맞는 GPU frequency 찾기 +static inline int get_freq_for_temp(int temp_celsius) { + // 온도가 높을수록 낮은 주파수 사용 + // 매핑된 온도 중 현재 온도 이상인 가장 낮은 온도 찾기 + int target_freq = 0; + for (auto& pair : temp_to_freq) { + if (temp_celsius >= pair.first) { + target_freq = pair.second; + } + } + return target_freq; +} + +// 온도 기반 thermal control static inline void thermal_control_check() { static bool initialized = false; static int call_count = 0; - static bool throttled = false; + static int current_freq = 0; // 초기화 if (!initialized) { @@ -196,33 +247,29 @@ static inline void thermal_control_check() { return; } - int temp_mc = read_gpu_temp(); - if (temp_mc < 0) return; + // int temp_mc = read_gpu_temp(); + // if (temp_mc < 0) return; - double temp_celsius = temp_mc / 1000.0; + // int temp_celsius = temp_mc / 1000; + int temp_celsius = 60; - // 60도 이상이면 throttle - if (temp_mc >= TEMP_THRESHOLD_MC && !throttled) { - set_cpu_freq(TARGET_CPU_FREQ); - throttled = true; - - // 🔥 콘솔 출력 - fprintf(stderr, "Thermal: GPU temp %.1f°C >= 60°C, throttling CPU to %d KHz\n", - temp_celsius, TARGET_CPU_FREQ); - - // 🔥 CSV에 기록 - log_thermal_event("THROTTLE", temp_celsius, TARGET_CPU_FREQ); - } - // 55도 이하로 내려가면 throttle 해제 - else if (temp_mc < (TEMP_THRESHOLD_MC - 5000) && throttled) { - throttled = false; - - // 🔥 콘솔 출력 - fprintf(stderr, "Thermal: GPU temp %.1f°C < 55°C, releasing throttle\n", - temp_celsius); - - // 🔥 CSV에 기록 - log_thermal_event("RELEASE", temp_celsius, 0); + // 온도에 맞는 주파수 찾기 + int target_freq = get_freq_for_temp(temp_celsius); + + // 주파수가 바뀌어야 할 때만 설정 + if (target_freq > 0 && target_freq != current_freq) { + if (set_gpu_freq(target_freq)) { + current_freq = target_freq; + + // 🔥 콘솔 출력 + fprintf(stderr, "Thermal: wants to set GPU temp to be %d°C, setting GPU freq to %d Hz\n", + temp_celsius, target_freq); + + // 🔥 CSV에 기록 + log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); + } else { + fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); + } } } From 417644729813ca376c685972eb0796ad1853d0c3 Mon Sep 17 00:00:00 2001 From: Aiden Date: Tue, 28 Oct 2025 02:04:55 -0400 Subject: [PATCH 08/14] temperature fixed --- src/thermal_control.h | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 33cb0dd968cd5..cb6a832b0065f 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -17,7 +17,9 @@ #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" #define GPU_MIN_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/min_freq" #define GPU_MAX_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/max_freq" -#define CHECK_INTERVAL 2 // 10 토큰마다 한 번 체크 +// #define CHECK_INTERVAL 5 // 10 토큰마다 한 번 체크 + +#define TARGET_TEMPERATURE 60 // 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) static std::map temp_to_freq = { @@ -232,6 +234,7 @@ static inline int get_freq_for_temp(int temp_celsius) { // 온도 기반 thermal control static inline void thermal_control_check() { static bool initialized = false; + static bool finished = false; static int call_count = 0; static int current_freq = 0; @@ -241,35 +244,31 @@ static inline void thermal_control_check() { fprintf(stderr, "Thermal: Control initialized\n"); } - // CHECK_INTERVAL 토큰마다만 체크 - call_count++; - if (call_count % CHECK_INTERVAL != 0) { - return; - } - - // int temp_mc = read_gpu_temp(); - // if (temp_mc < 0) return; - - // int temp_celsius = temp_mc / 1000; - int temp_celsius = 60; + int temp_mc = read_gpu_temp(); + if (temp_mc < 0) return; - // 온도에 맞는 주파수 찾기 - int target_freq = get_freq_for_temp(temp_celsius); + int temp_celsius = temp_mc / 1000; + + if (temp_celsius >= temp_celsius && !finished){ + finished = true; - // 주파수가 바뀌어야 할 때만 설정 - if (target_freq > 0 && target_freq != current_freq) { + // 온도에 맞는 주파수 찾기 + int target_freq = get_freq_for_temp(temp_celsius); + + // 주파수가 바뀌어야 할 때만 설정 if (set_gpu_freq(target_freq)) { current_freq = target_freq; - // 🔥 콘솔 출력 + // 콘솔 출력 fprintf(stderr, "Thermal: wants to set GPU temp to be %d°C, setting GPU freq to %d Hz\n", temp_celsius, target_freq); - // 🔥 CSV에 기록 - log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); + // CSV에 기록 + // log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); } else { fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); } + } } From b450abf171598053cdcb8027dac6e6d78365192a Mon Sep 17 00:00:00 2001 From: Aiden Date: Tue, 28 Oct 2025 21:07:07 -0400 Subject: [PATCH 09/14] temp added --- src/thermal_control.h | 56 +++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index cb6a832b0065f..2771ae6defecc 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -30,24 +30,24 @@ static std::map temp_to_freq = { // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) extern std::ofstream g_csv; -// GPU 온도 읽기 (millidegree C) -static inline int read_gpu_temp() { - int fd = open(GPU_TEMP_PATH, O_RDONLY); - if (fd < 0) return -1; - - char buf[32]; - ssize_t n = read(fd, buf, sizeof(buf) - 1); - close(fd); - - if (n <= 0) return -1; - buf[n] = '\0'; - - // Fast parse - int temp = 0; - for (int i = 0; i < n && buf[i] >= '0' && buf[i] <= '9'; i++) { - temp = temp * 10 + (buf[i] - '0'); + +// Read GPU Temperature + +double read_gpu_temp() { + const std::string path = "/sys/class/kgsl/kgsl-3d0/temp"; + std::ifstream file(path); + + if (!file.is_open()) { + fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); + return -1.0; // 오류 시 음수 반환 } - return temp; + + int temp_milli = 0; + file >> temp_milli; + file.close(); + + // 밀리도 단위 → 섭씨 + return temp_milli; } // GPU frequency 설정 - echo처럼 @@ -129,24 +129,6 @@ static inline bool set_cpu_freq(int freq_khz) { } */ -// GPU 온도 읽기 (millidegree C) - FD 재사용 -static inline int read_gpu_temp_fast(int fd) { - if (fd < 0) return -1; - - if (lseek(fd, 0, SEEK_SET) < 0) return -1; - - char buf[32]; - ssize_t n = read(fd, buf, sizeof(buf) - 1); - if (n <= 0) return -1; - buf[n] = '\0'; - - // Fast parse - int temp = 0; - for (int i = 0; i < n && buf[i] >= '0' && buf[i] <= '9'; i++) { - temp = temp * 10 + (buf[i] - '0'); - } - return temp; -} // CPU frequency 설정 - 캐시된 FD 사용 static inline bool set_cpu_freq_cached(int *fd_cache, int num_fds, int freq_khz) { @@ -245,11 +227,11 @@ static inline void thermal_control_check() { } int temp_mc = read_gpu_temp(); - if (temp_mc < 0) return; + if (temp_mc <= 0) return; int temp_celsius = temp_mc / 1000; - if (temp_celsius >= temp_celsius && !finished){ + if (temp_celsius >= TARGET_TEMPERATURE && !finished){ finished = true; // 온도에 맞는 주파수 찾기 From ae0fb6fb1c2ce10e037597cdee19dcae107806d1 Mon Sep 17 00:00:00 2001 From: Aiden Date: Tue, 28 Oct 2025 21:45:14 -0400 Subject: [PATCH 10/14] quick fix --- src/thermal_control.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 2771ae6defecc..7393397cd51e1 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -24,7 +24,7 @@ // 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) static std::map temp_to_freq = { {60, 443000000}, - {70, 660000000} + {65, 660000000} }; // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) From e01da77bb15a0af176d13c7b719504324dfca181 Mon Sep 17 00:00:00 2001 From: Aiden Date: Sun, 2 Nov 2025 00:08:05 -0400 Subject: [PATCH 11/14] small change --- src/thermal_control.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 7393397cd51e1..a37c4c4ddd2c2 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -235,7 +235,8 @@ static inline void thermal_control_check() { finished = true; // 온도에 맞는 주파수 찾기 - int target_freq = get_freq_for_temp(temp_celsius); + int target_freq = get_freq_for_temp(TARGET_TEMPERATURE); + // int target_freq = TARGET_TEMPERATURE; // 주파수가 바뀌어야 할 때만 설정 if (set_gpu_freq(target_freq)) { From 28466b11627af414d6dd42614725c5e2448f5b55 Mon Sep 17 00:00:00 2001 From: Aiden Date: Fri, 7 Nov 2025 15:37:39 -0500 Subject: [PATCH 12/14] focusing on gpu --- src/thermal_control.h | 109 ------------------------------------------ 1 file changed, 109 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index a37c4c4ddd2c2..91e79e2219c6e 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -79,115 +79,6 @@ static inline bool set_gpu_freq(int freq_hz) { return success; } -// CPU frequency 설정 - echo처럼 (주석 처리) -/* -static inline bool set_cpu_freq(int freq_khz) { - DIR *dir = opendir("/sys/devices/system/cpu"); - if (!dir) { - fprintf(stderr, "Thermal: Cannot open /sys/devices/system/cpu: %s\n", strerror(errno)); - return false; - } - - char freq_str[32]; - snprintf(freq_str, sizeof(freq_str), "%d\n", freq_khz); - - bool success = false; - struct dirent *entry; - - while ((entry = readdir(dir)) != NULL) { - // cpu0, cpu1, ... 찾기 - if (strncmp(entry->d_name, "cpu", 3) != 0) continue; - char c = entry->d_name[3]; - if (c < '0' || c > '9') continue; - - char path[256]; - - // scaling_max_freq 먼저 (min보다 크거나 같아야 함) - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", - entry->d_name); - int fd = open(path, O_WRONLY | O_TRUNC); - if (fd >= 0) { - write(fd, freq_str, strlen(freq_str)); - close(fd); - success = true; - } - - // scaling_min_freq - snprintf(path, sizeof(path), - "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", - entry->d_name); - fd = open(path, O_WRONLY | O_TRUNC); - if (fd >= 0) { - write(fd, freq_str, strlen(freq_str)); - close(fd); - } - } - - closedir(dir); - return success; -} -*/ - - -// CPU frequency 설정 - 캐시된 FD 사용 -static inline bool set_cpu_freq_cached(int *fd_cache, int num_fds, int freq_khz) { - if (num_fds == 0) return false; - - char freq_str[16]; - int len = snprintf(freq_str, sizeof(freq_str), "%d", freq_khz); - - bool success = true; - for (int i = 0; i < num_fds; i += 2) { - // max 먼저 - if (fd_cache[i + 1] >= 0) { - if (lseek(fd_cache[i + 1], 0, SEEK_SET) >= 0) { - write(fd_cache[i + 1], freq_str, len); - } else { - success = false; - } - } - // min 나중에 - if (fd_cache[i] >= 0) { - if (lseek(fd_cache[i], 0, SEEK_SET) >= 0) { - write(fd_cache[i], freq_str, len); - } else { - success = false; - } - } - } - - return success; -} - -// 초기화: CPU FD들을 미리 열어두기 -static inline int init_cpu_freq_fds(int *fd_cache, int max_fds) { - DIR *dir = opendir("/sys/devices/system/cpu"); - if (!dir) return 0; - - int count = 0; - struct dirent *entry; - - while ((entry = readdir(dir)) != NULL && count < max_fds - 1) { - if (strncmp(entry->d_name, "cpu", 3) != 0) continue; - if (entry->d_name[3] < '0' || entry->d_name[3] > '9') continue; - - char min_path[256], max_path[256]; - snprintf(min_path, sizeof(min_path), - "/sys/devices/system/cpu/%s/cpufreq/scaling_min_freq", - entry->d_name); - snprintf(max_path, sizeof(max_path), - "/sys/devices/system/cpu/%s/cpufreq/scaling_max_freq", - entry->d_name); - - fd_cache[count++] = open(min_path, O_WRONLY); - fd_cache[count++] = open(max_path, O_WRONLY); - } - - closedir(dir); - return count; -} - // 🔥 CSV에 thermal 이벤트 기록 static inline void log_thermal_event(const char* event, double temp_celsius, int freq_hz = 0) { if (!g_csv.is_open()) return; From 5cd501d97e4ccaf30855de8f293788067b6f039e Mon Sep 17 00:00:00 2001 From: Aiden Date: Fri, 7 Nov 2025 20:36:52 -0500 Subject: [PATCH 13/14] PID controller added --- src/thermal_control.h | 380 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 328 insertions(+), 52 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index 91e79e2219c6e..e753bad9fb295 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -1,6 +1,157 @@ -// thermal_control.h -#ifndef THERMAL_CONTROL_H -#define THERMAL_CONTROL_H +// // thermal_control.h +// #ifndef THERMAL_CONTROL_H +// #define THERMAL_CONTROL_H + +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include +// #include + +// #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" +// #define GPU_MIN_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/min_freq" +// #define GPU_MAX_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/max_freq" +// // #define CHECK_INTERVAL 5 // 10 토큰마다 한 번 체크 + +// #define TARGET_TEMPERATURE 60 + +// // 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) +// static std::map temp_to_freq = { +// {60, 443000000}, +// {65, 660000000} +// }; + +// // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) +// extern std::ofstream g_csv; + + +// // Read GPU Temperature + +// double read_gpu_temp() { +// const std::string path = "/sys/class/kgsl/kgsl-3d0/temp"; +// std::ifstream file(path); + +// if (!file.is_open()) { +// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); +// return -1.0; // 오류 시 음수 반환 +// } + +// int temp_milli = 0; +// file >> temp_milli; +// file.close(); + +// // 밀리도 단위 → 섭씨 +// return temp_milli; +// } + +// // GPU frequency 설정 - echo처럼 +// static inline bool set_gpu_freq(int freq_hz) { +// char freq_str[32]; +// snprintf(freq_str, sizeof(freq_str), "%d\n", freq_hz); + +// bool success = false; + +// // min_freq 설정 +// int fd_min = open(GPU_MIN_FREQ_PATH, O_WRONLY | O_TRUNC); +// if (fd_min >= 0) { +// write(fd_min, freq_str, strlen(freq_str)); +// close(fd_min); +// success = true; +// } else { +// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); +// } + +// // max_freq 설정 +// int fd_max = open(GPU_MAX_FREQ_PATH, O_WRONLY | O_TRUNC); +// if (fd_max >= 0) { +// write(fd_max, freq_str, strlen(freq_str)); +// close(fd_max); +// } else { +// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MAX_FREQ_PATH, strerror(errno)); +// } + +// return success; +// } + +// // 🔥 CSV에 thermal 이벤트 기록 +// static inline void log_thermal_event(const char* event, double temp_celsius, int freq_hz = 0) { +// if (!g_csv.is_open()) return; + +// auto ts = std::chrono::system_clock::now().time_since_epoch(); +// auto ts_ms = std::chrono::duration_cast(ts).count(); + +// // CSV 형식: timestamp,-1,event_type,temp,freq +// g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_hz << "\n"; +// g_csv.flush(); +// } + +// // 온도에 맞는 GPU frequency 찾기 +// static inline int get_freq_for_temp(int temp_celsius) { +// // 온도가 높을수록 낮은 주파수 사용 +// // 매핑된 온도 중 현재 온도 이상인 가장 낮은 온도 찾기 +// int target_freq = 0; +// for (auto& pair : temp_to_freq) { +// if (temp_celsius >= pair.first) { +// target_freq = pair.second; +// } +// } +// return target_freq; +// } + +// // 온도 기반 thermal control +// static inline void thermal_control_check() { +// static bool initialized = false; +// static bool finished = false; +// static int call_count = 0; +// static int current_freq = 0; + +// // 초기화 +// if (!initialized) { +// initialized = true; +// fprintf(stderr, "Thermal: Control initialized\n"); +// } + +// int temp_mc = read_gpu_temp(); +// if (temp_mc <= 0) return; + +// int temp_celsius = temp_mc / 1000; + +// if (temp_celsius >= TARGET_TEMPERATURE && !finished){ +// finished = true; + +// // 온도에 맞는 주파수 찾기 +// int target_freq = get_freq_for_temp(TARGET_TEMPERATURE); +// // int target_freq = TARGET_TEMPERATURE; + +// // 주파수가 바뀌어야 할 때만 설정 +// if (set_gpu_freq(target_freq)) { +// current_freq = target_freq; + +// // 콘솔 출력 +// fprintf(stderr, "Thermal: wants to set GPU temp to be %d°C, setting GPU freq to %d Hz\n", +// temp_celsius, target_freq); + +// // CSV에 기록 +// // log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); +// } else { +// fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); +// } + +// } +// } + +// #endif // THERMAL_CONTROL_H + + +// thermal_control_pid.h +#ifndef THERMAL_CONTROL_PID_H +#define THERMAL_CONTROL_PID_H #include #include @@ -12,45 +163,71 @@ #include #include #include -#include +#include +#include #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" #define GPU_MIN_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/min_freq" #define GPU_MAX_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/max_freq" -// #define CHECK_INTERVAL 5 // 10 토큰마다 한 번 체크 -#define TARGET_TEMPERATURE 60 +// PID 설정 +#define TARGET_TEMPERATURE 60.0 // 목표 온도 (°C) +#define CHECK_INTERVAL_MS 500 // 체크 간격 (ms) - 더 자주 체크할수록 더 정확함 -// 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) -static std::map temp_to_freq = { - {60, 443000000}, - {65, 660000000} -}; +// PID 게인 (튜닝 필요) +#define KP 0.8 // Proportional gain +#define KI 0.1 // Integral gain +#define KD 0.5 // Derivative gain // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) extern std::ofstream g_csv; +// Snapdragon Elite GPU 주파수 레벨 (Hz) +static const int GPU_FREQ_LEVELS[] = { + 900000000, // 0 + 832000000, // 1 + 734000000, // 2 + 660000000, // 3 + 607000000, // 4 + 525000000, // 5 + 443000000, // 6 + 389000000, // 7 + 342000000, // 8 + 222000000, // 9 + 160000000 // 10 +}; +static const int NUM_FREQ_LEVELS = sizeof(GPU_FREQ_LEVELS) / sizeof(GPU_FREQ_LEVELS[0]); -// Read GPU Temperature +// PID 상태 구조체 +struct PIDState { + double integral; // 오차 적분값 + double prev_error; // 이전 오차 + int current_freq_index; // 현재 주파수 인덱스 + bool active; // PID 활성화 여부 + std::chrono::steady_clock::time_point last_check; +}; -double read_gpu_temp() { - const std::string path = "/sys/class/kgsl/kgsl-3d0/temp"; +static PIDState g_pid_state = {0.0, 0.0, 0, false}; + +// Read GPU Temperature +static inline double read_gpu_temp() { + const std::string path = GPU_TEMP_PATH; std::ifstream file(path); if (!file.is_open()) { - fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); - return -1.0; // 오류 시 음수 반환 + fprintf(stderr, "Thermal: Cannot open %s: %s\n", path.c_str(), strerror(errno)); + return -1.0; } int temp_milli = 0; file >> temp_milli; file.close(); - // 밀리도 단위 → 섭씨 - return temp_milli; + // 밀리도 → 섭씨 + return temp_milli / 1000.0; } -// GPU frequency 설정 - echo처럼 +// GPU frequency 설정 static inline bool set_gpu_freq(int freq_hz) { char freq_str[32]; snprintf(freq_str, sizeof(freq_str), "%d\n", freq_hz); @@ -91,59 +268,158 @@ static inline void log_thermal_event(const char* event, double temp_celsius, int g_csv.flush(); } -// 온도에 맞는 GPU frequency 찾기 -static inline int get_freq_for_temp(int temp_celsius) { - // 온도가 높을수록 낮은 주파수 사용 - // 매핑된 온도 중 현재 온도 이상인 가장 낮은 온도 찾기 - int target_freq = 0; - for (auto& pair : temp_to_freq) { - if (temp_celsius >= pair.first) { - target_freq = pair.second; - } +// PID 계산 함수 +static inline int compute_pid(double current_temp, double dt) { + // 오차 계산 (목표 - 현재) + // 양수면 온도가 낮음 → 주파수 올려야 함 + // 음수면 온도가 높음 → 주파수 낮춰야 함 + double error = TARGET_TEMPERATURE - current_temp; + + // P: 비례 항 (현재 오차) + double p_term = KP * error; + + // I: 적분 항 (누적 오차) + g_pid_state.integral += error * dt; + + // Anti-windup: integral 값 제한 (너무 커지는 것 방지) + const double INTEGRAL_MAX = 50.0; + g_pid_state.integral = std::max(-INTEGRAL_MAX, std::min(INTEGRAL_MAX, g_pid_state.integral)); + double i_term = KI * g_pid_state.integral; + + // D: 미분 항 (오차 변화율) + double derivative = (error - g_pid_state.prev_error) / dt; + double d_term = KD * derivative; + + // PID 출력 계산 + double pid_output = p_term + i_term + d_term; + + // 이전 오차 저장 + g_pid_state.prev_error = error; + + // PID 출력을 주파수 인덱스 변화량으로 변환 + // 양수 → 주파수 올림 (인덱스 감소) + // 음수 → 주파수 낮춤 (인덱스 증가) + int index_delta = 0; + + if (pid_output > 2.0) { + index_delta = -2; // 빠르게 주파수 올림 + } else if (pid_output > 0.5) { + index_delta = -1; // 주파수 올림 + } else if (pid_output < -2.0) { + index_delta = 2; // 빠르게 주파수 낮춤 + } else if (pid_output < -0.5) { + index_delta = 1; // 주파수 낮춤 } - return target_freq; + // -0.5 ~ 0.5 사이면 유지 (dead zone) + + return index_delta; } -// 온도 기반 thermal control +// PID 기반 thermal control static inline void thermal_control_check() { static bool initialized = false; - static bool finished = false; - static int call_count = 0; - static int current_freq = 0; // 초기화 if (!initialized) { initialized = true; - fprintf(stderr, "Thermal: Control initialized\n"); + g_pid_state.current_freq_index = 0; // 최대 주파수에서 시작 + g_pid_state.last_check = std::chrono::steady_clock::now(); + fprintf(stderr, "Thermal: PID Controller initialized\n"); + fprintf(stderr, "Thermal: Target temperature: %.1f°C\n", TARGET_TEMPERATURE); + fprintf(stderr, "Thermal: PID gains - Kp:%.2f Ki:%.2f Kd:%.2f\n", KP, KI, KD); } - int temp_mc = read_gpu_temp(); - if (temp_mc <= 0) return; + // 현재 시간 + auto now = std::chrono::steady_clock::now(); + auto elapsed = std::chrono::duration_cast(now - g_pid_state.last_check); - int temp_celsius = temp_mc / 1000; - - if (temp_celsius >= TARGET_TEMPERATURE && !finished){ - finished = true; + // 체크 간격이 안 됐으면 리턴 + if (elapsed.count() < CHECK_INTERVAL_MS) { + return; + } + + // 온도 읽기 + double temp = read_gpu_temp(); + if (temp <= 0) return; + + // PID 활성화 조건: 목표 온도 근처 또는 이미 활성화됨 + if (!g_pid_state.active && temp >= TARGET_TEMPERATURE - 5.0) { + g_pid_state.active = true; + fprintf(stderr, "Thermal: PID activated at %.1f°C\n", temp); + log_thermal_event("PID_ACTIVATED", temp); + } + + if (!g_pid_state.active) { + g_pid_state.last_check = now; + return; // 아직 PID 작동 안 함 + } + + // dt 계산 (초 단위) + double dt = elapsed.count() / 1000.0; + + // PID 계산 + int index_delta = compute_pid(temp, dt); + + // 주파수 인덱스 업데이트 + int new_index = g_pid_state.current_freq_index + index_delta; + new_index = std::max(0, std::min(NUM_FREQ_LEVELS - 1, new_index)); - // 온도에 맞는 주파수 찾기 - int target_freq = get_freq_for_temp(TARGET_TEMPERATURE); - // int target_freq = TARGET_TEMPERATURE; + // 주파수가 바뀌어야 하면 설정 + if (new_index != g_pid_state.current_freq_index) { + int new_freq = GPU_FREQ_LEVELS[new_index]; - // 주파수가 바뀌어야 할 때만 설정 - if (set_gpu_freq(target_freq)) { - current_freq = target_freq; + if (set_gpu_freq(new_freq)) { + // 디버그 출력 + double error = TARGET_TEMPERATURE - temp; + fprintf(stderr, "Thermal: T=%.1f°C (error=%.1f) → Freq[%d->%d] = %d Hz (%.0f MHz)\n", + temp, error, + g_pid_state.current_freq_index, new_index, + new_freq, new_freq / 1e6); - // 콘솔 출력 - fprintf(stderr, "Thermal: wants to set GPU temp to be %d°C, setting GPU freq to %d Hz\n", - temp_celsius, target_freq); + g_pid_state.current_freq_index = new_index; - // CSV에 기록 - // log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); + // CSV 기록 + log_thermal_event("FREQ_CHANGE", temp, new_freq); } else { fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); } + } else { + // 주파수 안 바뀜 (안정 상태) + static int stable_count = 0; + stable_count++; + if (stable_count % 10 == 0) { // 10회마다 한 번 출력 + fprintf(stderr, "Thermal: Stable at T=%.1f°C, Freq=%d MHz\n", + temp, GPU_FREQ_LEVELS[g_pid_state.current_freq_index] / 1000000); + } } + + g_pid_state.last_check = now; +} + +// PID 리셋 (필요시) +static inline void reset_pid() { + g_pid_state.integral = 0.0; + g_pid_state.prev_error = 0.0; + g_pid_state.current_freq_index = 0; + g_pid_state.active = false; + fprintf(stderr, "Thermal: PID reset\n"); +} + +// 현재 PID 상태 출력 (디버깅용) +static inline void print_pid_status() { + double temp = read_gpu_temp(); + int freq = GPU_FREQ_LEVELS[g_pid_state.current_freq_index]; + + fprintf(stderr, "\n=== PID Status ===\n"); + fprintf(stderr, "Current Temp: %.1f°C\n", temp); + fprintf(stderr, "Target Temp: %.1f°C\n", TARGET_TEMPERATURE); + fprintf(stderr, "Error: %.1f°C\n", TARGET_TEMPERATURE - temp); + fprintf(stderr, "Integral: %.2f\n", g_pid_state.integral); + fprintf(stderr, "Current Freq: %d MHz (Level %d/%d)\n", + freq / 1000000, g_pid_state.current_freq_index, NUM_FREQ_LEVELS - 1); + fprintf(stderr, "PID Active: %s\n", g_pid_state.active ? "Yes" : "No"); + fprintf(stderr, "==================\n\n"); } -#endif // THERMAL_CONTROL_H \ No newline at end of file +#endif // THERMAL_CONTROL_PID_H \ No newline at end of file From 761a92962d49459cce5ea86dd773ec9030041b9b Mon Sep 17 00:00:00 2001 From: Aiden Date: Sat, 8 Nov 2025 00:34:48 -0500 Subject: [PATCH 14/14] modified to include pid temp and freq csv --- src/thermal_control.h | 238 ++++++++++++------------------------------ 1 file changed, 67 insertions(+), 171 deletions(-) diff --git a/src/thermal_control.h b/src/thermal_control.h index e753bad9fb295..b5b289358af91 100644 --- a/src/thermal_control.h +++ b/src/thermal_control.h @@ -1,154 +1,3 @@ -// // thermal_control.h -// #ifndef THERMAL_CONTROL_H -// #define THERMAL_CONTROL_H - -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include -// #include - -// #define GPU_TEMP_PATH "/sys/class/kgsl/kgsl-3d0/temp" -// #define GPU_MIN_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/min_freq" -// #define GPU_MAX_FREQ_PATH "/sys/class/kgsl/kgsl-3d0/devfreq/max_freq" -// // #define CHECK_INTERVAL 5 // 10 토큰마다 한 번 체크 - -// #define TARGET_TEMPERATURE 60 - -// // 온도-주파수 매핑 (온도(°C) -> GPU frequency(Hz)) -// static std::map temp_to_freq = { -// {60, 443000000}, -// {65, 660000000} -// }; - -// // 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) -// extern std::ofstream g_csv; - - -// // Read GPU Temperature - -// double read_gpu_temp() { -// const std::string path = "/sys/class/kgsl/kgsl-3d0/temp"; -// std::ifstream file(path); - -// if (!file.is_open()) { -// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); -// return -1.0; // 오류 시 음수 반환 -// } - -// int temp_milli = 0; -// file >> temp_milli; -// file.close(); - -// // 밀리도 단위 → 섭씨 -// return temp_milli; -// } - -// // GPU frequency 설정 - echo처럼 -// static inline bool set_gpu_freq(int freq_hz) { -// char freq_str[32]; -// snprintf(freq_str, sizeof(freq_str), "%d\n", freq_hz); - -// bool success = false; - -// // min_freq 설정 -// int fd_min = open(GPU_MIN_FREQ_PATH, O_WRONLY | O_TRUNC); -// if (fd_min >= 0) { -// write(fd_min, freq_str, strlen(freq_str)); -// close(fd_min); -// success = true; -// } else { -// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MIN_FREQ_PATH, strerror(errno)); -// } - -// // max_freq 설정 -// int fd_max = open(GPU_MAX_FREQ_PATH, O_WRONLY | O_TRUNC); -// if (fd_max >= 0) { -// write(fd_max, freq_str, strlen(freq_str)); -// close(fd_max); -// } else { -// fprintf(stderr, "Thermal: Cannot open %s: %s\n", GPU_MAX_FREQ_PATH, strerror(errno)); -// } - -// return success; -// } - -// // 🔥 CSV에 thermal 이벤트 기록 -// static inline void log_thermal_event(const char* event, double temp_celsius, int freq_hz = 0) { -// if (!g_csv.is_open()) return; - -// auto ts = std::chrono::system_clock::now().time_since_epoch(); -// auto ts_ms = std::chrono::duration_cast(ts).count(); - -// // CSV 형식: timestamp,-1,event_type,temp,freq -// g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_hz << "\n"; -// g_csv.flush(); -// } - -// // 온도에 맞는 GPU frequency 찾기 -// static inline int get_freq_for_temp(int temp_celsius) { -// // 온도가 높을수록 낮은 주파수 사용 -// // 매핑된 온도 중 현재 온도 이상인 가장 낮은 온도 찾기 -// int target_freq = 0; -// for (auto& pair : temp_to_freq) { -// if (temp_celsius >= pair.first) { -// target_freq = pair.second; -// } -// } -// return target_freq; -// } - -// // 온도 기반 thermal control -// static inline void thermal_control_check() { -// static bool initialized = false; -// static bool finished = false; -// static int call_count = 0; -// static int current_freq = 0; - -// // 초기화 -// if (!initialized) { -// initialized = true; -// fprintf(stderr, "Thermal: Control initialized\n"); -// } - -// int temp_mc = read_gpu_temp(); -// if (temp_mc <= 0) return; - -// int temp_celsius = temp_mc / 1000; - -// if (temp_celsius >= TARGET_TEMPERATURE && !finished){ -// finished = true; - -// // 온도에 맞는 주파수 찾기 -// int target_freq = get_freq_for_temp(TARGET_TEMPERATURE); -// // int target_freq = TARGET_TEMPERATURE; - -// // 주파수가 바뀌어야 할 때만 설정 -// if (set_gpu_freq(target_freq)) { -// current_freq = target_freq; - -// // 콘솔 출력 -// fprintf(stderr, "Thermal: wants to set GPU temp to be %d°C, setting GPU freq to %d Hz\n", -// temp_celsius, target_freq); - -// // CSV에 기록 -// // log_thermal_event("FREQ_CHANGE", (double)temp_celsius, target_freq); -// } else { -// fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); -// } - -// } -// } - -// #endif // THERMAL_CONTROL_H - - // thermal_control_pid.h #ifndef THERMAL_CONTROL_PID_H #define THERMAL_CONTROL_PID_H @@ -179,8 +28,9 @@ #define KI 0.1 // Integral gain #define KD 0.5 // Derivative gain -// 🔥 Throughput monitoring CSV (llama.cpp에서 정의됨) -extern std::ofstream g_csv; +// CSV 로깅용 파일 스트림 +static std::ofstream g_temp_csv; // pid_temp.csv +static std::ofstream g_freq_csv; // pid_freq.csv // Snapdragon Elite GPU 주파수 레벨 (Hz) static const int GPU_FREQ_LEVELS[] = { @@ -256,18 +106,36 @@ static inline bool set_gpu_freq(int freq_hz) { return success; } -// 🔥 CSV에 thermal 이벤트 기록 -static inline void log_thermal_event(const char* event, double temp_celsius, int freq_hz = 0) { - if (!g_csv.is_open()) return; - - auto ts = std::chrono::system_clock::now().time_since_epoch(); - auto ts_ms = std::chrono::duration_cast(ts).count(); +// CSV 파일 초기화 +static inline void init_thermal_csv() { + g_temp_csv.open("pid_temp.csv", std::ios::out | std::ios::trunc); + if (g_temp_csv.is_open()) { + g_temp_csv << "timestamp_sec,temperature_celsius\n"; + g_temp_csv.flush(); + fprintf(stderr, "Thermal: Logging temperature to pid_temp.csv\n"); + } - // CSV 형식: timestamp,-1,event_type,temp,freq - g_csv << ts_ms << ",-1," << event << "," << temp_celsius << "," << freq_hz << "\n"; - g_csv.flush(); + g_freq_csv.open("pid_freq.csv", std::ios::out | std::ios::trunc); + if (g_freq_csv.is_open()) { + g_freq_csv << "timestamp_sec,temperature_celsius,freq_index_from,freq_index_to,frequency_hz,frequency_mhz,error\n"; + g_freq_csv.flush(); + fprintf(stderr, "Thermal: Logging frequency changes to pid_freq.csv\n"); + } } +// CSV 파일 닫기 +static inline void close_thermal_csv() { + if (g_temp_csv.is_open()) { + g_temp_csv.close(); + fprintf(stderr, "Thermal: Closed pid_temp.csv\n"); + } + if (g_freq_csv.is_open()) { + g_freq_csv.close(); + fprintf(stderr, "Thermal: Closed pid_freq.csv\n"); + } +} + + // PID 계산 함수 static inline int compute_pid(double current_temp, double dt) { // 오차 계산 (목표 - 현재) @@ -318,12 +186,20 @@ static inline int compute_pid(double current_temp, double dt) { // PID 기반 thermal control static inline void thermal_control_check() { static bool initialized = false; + static auto start_time = std::chrono::steady_clock::now(); + static auto last_temp_log = std::chrono::steady_clock::now(); // 초기화 if (!initialized) { initialized = true; g_pid_state.current_freq_index = 0; // 최대 주파수에서 시작 g_pid_state.last_check = std::chrono::steady_clock::now(); + start_time = std::chrono::steady_clock::now(); + last_temp_log = std::chrono::steady_clock::now(); + + // CSV 로깅 초기화 + init_thermal_csv(); + fprintf(stderr, "Thermal: PID Controller initialized\n"); fprintf(stderr, "Thermal: Target temperature: %.1f°C\n", TARGET_TEMPERATURE); fprintf(stderr, "Thermal: PID gains - Kp:%.2f Ki:%.2f Kd:%.2f\n", KP, KI, KD); @@ -333,20 +209,30 @@ static inline void thermal_control_check() { auto now = std::chrono::steady_clock::now(); auto elapsed = std::chrono::duration_cast(now - g_pid_state.last_check); + // 온도 읽기 + double temp = read_gpu_temp(); + if (temp <= 0) return; + + // 🔥 매초마다 온도를 pid_temp.csv에 기록 + auto time_since_last_temp_log = std::chrono::duration_cast(now - last_temp_log); + if (time_since_last_temp_log.count() >= 1000) { // 1초마다 + if (g_temp_csv.is_open()) { + auto elapsed_sec = std::chrono::duration_cast(now - start_time).count(); + g_temp_csv << elapsed_sec << "," << temp << "\n"; + g_temp_csv.flush(); + } + last_temp_log = now; + } + // 체크 간격이 안 됐으면 리턴 if (elapsed.count() < CHECK_INTERVAL_MS) { return; } - // 온도 읽기 - double temp = read_gpu_temp(); - if (temp <= 0) return; - // PID 활성화 조건: 목표 온도 근처 또는 이미 활성화됨 if (!g_pid_state.active && temp >= TARGET_TEMPERATURE - 5.0) { g_pid_state.active = true; fprintf(stderr, "Thermal: PID activated at %.1f°C\n", temp); - log_thermal_event("PID_ACTIVATED", temp); } if (!g_pid_state.active) { @@ -376,10 +262,20 @@ static inline void thermal_control_check() { g_pid_state.current_freq_index, new_index, new_freq, new_freq / 1e6); - g_pid_state.current_freq_index = new_index; + // 🔥 주파수 변경을 pid_freq.csv에 기록 + if (g_freq_csv.is_open()) { + auto elapsed_sec = std::chrono::duration_cast(now - start_time).count(); + g_freq_csv << elapsed_sec << "," + << temp << "," + << g_pid_state.current_freq_index << "," + << new_index << "," + << new_freq << "," + << (new_freq / 1000000) << "," + << error << "\n"; + g_freq_csv.flush(); + } - // CSV 기록 - log_thermal_event("FREQ_CHANGE", temp, new_freq); + g_pid_state.current_freq_index = new_index; } else { fprintf(stderr, "Thermal: Failed to set GPU frequency (try sudo)\n"); } @@ -388,7 +284,7 @@ static inline void thermal_control_check() { static int stable_count = 0; stable_count++; - if (stable_count % 10 == 0) { // 10회마다 한 번 출력 + if (stable_count % 20 == 0) { // 20회마다 한 번 출력 fprintf(stderr, "Thermal: Stable at T=%.1f°C, Freq=%d MHz\n", temp, GPU_FREQ_LEVELS[g_pid_state.current_freq_index] / 1000000); }