From 0681e37a264a831136dda500f58e42e7ab988cbc Mon Sep 17 00:00:00 2001 From: Jiping Yin Date: Thu, 9 May 2024 19:00:33 +0800 Subject: [PATCH] [eBPF] Adjust Profiler --- agent/src/ebpf/Makefile | 4 +- agent/src/ebpf/kernel/include/bpf_base.h | 1 + agent/src/ebpf/kernel/include/perf_profiler.h | 3 +- agent/src/ebpf/kernel/perf_profiler.bpf.c | 2 + agent/src/ebpf/mod.rs | 14 + agent/src/ebpf/tools/code.style | 6 + agent/src/ebpf/user/config.h | 46 +- .../src/ebpf/user/profile/extended/extended.c | 32 + .../src/ebpf/user/profile/extended/extended.h | 20 + .../ebpf/user/profile/java/gen_syms_file.c | 10 +- agent/src/ebpf/user/profile/perf_profiler.c | 1304 +++-------------- agent/src/ebpf/user/profile/perf_profiler.h | 46 +- agent/src/ebpf/user/profile/profile_common.c | 1067 ++++++++++++++ agent/src/ebpf/user/profile/profile_common.h | 162 ++ agent/src/ebpf/user/profile/stringifier.c | 32 +- agent/src/ebpf/user/symbol.c | 133 +- agent/src/ebpf/user/symbol.h | 37 +- agent/src/ebpf/user/tracer.c | 38 +- agent/src/ebpf/user/tracer.h | 3 +- server/agent_config/example.yaml | 37 +- 20 files changed, 1769 insertions(+), 1228 deletions(-) create mode 100644 agent/src/ebpf/tools/code.style create mode 100644 agent/src/ebpf/user/profile/extended/extended.c create mode 100644 agent/src/ebpf/user/profile/extended/extended.h create mode 100644 agent/src/ebpf/user/profile/profile_common.c create mode 100644 agent/src/ebpf/user/profile/profile_common.h diff --git a/agent/src/ebpf/Makefile b/agent/src/ebpf/Makefile index 993cd786528..6dace6516cd 100644 --- a/agent/src/ebpf/Makefile +++ b/agent/src/ebpf/Makefile @@ -88,6 +88,8 @@ OBJS := user/elf.o \ user/mem.o \ user/vec.o \ user/bihash.o \ + user/profile/profile_common.o \ + $(patsubst %.c,%.o,$(wildcard user/profile/extended/*.c)) \ user/profile/perf_profiler.o \ user/profile/stringifier.o \ user/profile/java/df_jattach.o \ @@ -175,7 +177,7 @@ user/perf_profiler_bpf_common.c: tools/bintobuffer kernel/perf_profiler.bpf.c $(STATIC_OBJDIR) $(SHARED_OBJDIR): $(call msg,MKDIR,$@) - $(Q)mkdir -p $@/user/profile/java + $(Q)mkdir -p $@/user/profile/{java,extended} $(STATIC_OBJDIR)/user/socket.o: user/socket.c $(SOCKET_TRACE_ELFS) | $(STATIC_OBJDIR) $(call msg,CC,$@) diff --git a/agent/src/ebpf/kernel/include/bpf_base.h b/agent/src/ebpf/kernel/include/bpf_base.h index 7fd9b079170..b233f9422e9 100644 --- a/agent/src/ebpf/kernel/include/bpf_base.h +++ b/agent/src/ebpf/kernel/include/bpf_base.h @@ -261,6 +261,7 @@ _Pragma("GCC error \"PT_GO_REGS_PARM\""); #define KRETPROG(F) SEC("kretprobe/"__stringify(F)) int kretprobe__##F #define KPROG(F) SEC("kprobe/"__stringify(F)) int kprobe__##F #define TPPROG(F) SEC("tracepoint/syscalls/"__stringify(F)) int bpf_func_##F +#define TP_SCHED_PROG(F) SEC("tracepoint/sched/"__stringify(F)) int bpf_func_##F #ifndef CUR_CPU_IDENTIFIER #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) diff --git a/agent/src/ebpf/kernel/include/perf_profiler.h b/agent/src/ebpf/kernel/include/perf_profiler.h index 242659dfce6..ea2c74dc49b 100644 --- a/agent/src/ebpf/kernel/include/perf_profiler.h +++ b/agent/src/ebpf/kernel/include/perf_profiler.h @@ -37,7 +37,7 @@ typedef enum { ERROR_IDX, /* Count the number of failed push notifications. */ ENABLE_IDX, /* Enable profiler sampling flag. 0: disable sampling; 1: enable sampling. 
*/ - + MINBLOCK_TIME_IDX, /* The minimum blocking time, applied in the profiler extension.*/ PROFILER_CNT } profiler_idx; @@ -49,6 +49,7 @@ struct stack_trace_key_t { int kernstack; int userstack; __u64 timestamp; + __u64 duration_ns; }; #endif /* DF_BPF_PERF_PROFILER_H */ diff --git a/agent/src/ebpf/kernel/perf_profiler.bpf.c b/agent/src/ebpf/kernel/perf_profiler.bpf.c index 0114e600e7a..c64e839bd35 100644 --- a/agent/src/ebpf/kernel/perf_profiler.bpf.c +++ b/agent/src/ebpf/kernel/perf_profiler.bpf.c @@ -124,6 +124,7 @@ int bpf_perf_event(struct bpf_perf_event_data *ctx) key.cpu = bpf_get_smp_processor_id(); bpf_get_current_comm(&key.comm, sizeof(key.comm)); key.timestamp = bpf_ktime_get_ns(); + key.duration_ns = 1; /* * Note: @@ -213,3 +214,4 @@ int bpf_perf_event(struct bpf_perf_event_data *ctx) return 0; } + diff --git a/agent/src/ebpf/mod.rs b/agent/src/ebpf/mod.rs index c6295987d7b..8d3e76c4cdd 100644 --- a/agent/src/ebpf/mod.rs +++ b/agent/src/ebpf/mod.rs @@ -153,6 +153,12 @@ pub const EVENT_TYPE_PROC_EXEC: u32 = 1 << 5; #[allow(dead_code)] pub const EVENT_TYPE_PROC_EXIT: u32 = 1 << 6; +// Profiler types +#[allow(dead_code)] +pub const PROFILER_TYPE_UNKNOWN: u8 = 0; +#[allow(dead_code)] +pub const PROFILER_TYPE_ONCPU: u8 = 1; + //Process exec/exit events #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -367,6 +373,7 @@ pub struct SK_TRACE_STATS { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct stack_profile_data { + pub profiler_type : u8, // Profiler type, such as 1(PROFILER_TYPE_ONCPU). pub timestamp: u64, // Timestamp of the stack trace data(unit: nanoseconds). pub pid: u32, // User-space process-ID. /* @@ -384,6 +391,10 @@ pub struct stack_profile_data { * The profiler captures the number of occurrences of the same * data by querying with the quadruple * "" as the key. + * Note: + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in Microseconds) are + * used. */ pub count: u32, /* @@ -647,6 +658,9 @@ extern "C" { timeout: c_int, callback: extern "C" fn(data: *mut c_char, len: c_int), ) -> c_int; + + pub fn enable_oncpu_profiler() -> c_int; + pub fn disable_oncpu_profiler() -> c_int; } #[no_mangle] diff --git a/agent/src/ebpf/tools/code.style b/agent/src/ebpf/tools/code.style new file mode 100644 index 00000000000..ceb7791f998 --- /dev/null +++ b/agent/src/ebpf/tools/code.style @@ -0,0 +1,6 @@ +#! /bin/bash + +indent -npro -kr -i8 -ts8 -nss -nsc -ncs -nprs -sob -l80 -ss -cp1 --space-after-for --space-after-if --space-after-while --space-special-semicolon --blank-lines-after-procedures -v $1 +sed -i "s/{ }/{}/g" $1 +sed -i "s/) ;/);/g" $1 +sed -i "s/^ //g" $1 diff --git a/agent/src/ebpf/user/config.h b/agent/src/ebpf/user/config.h index aa675a33c4d..30853862ac5 100644 --- a/agent/src/ebpf/user/config.h +++ b/agent/src/ebpf/user/config.h @@ -19,7 +19,7 @@ #define EV_NAME_SIZE 1024 -#define BOOT_TIME_UPDATE_PERIOD 60 // 系统启动时间更新周期, 单位:秒 +#define BOOT_TIME_UPDATE_PERIOD 60 // 系统启动时间更新周期, 单位:秒 // eBPF Map Name #define MAP_MEMBERS_OFFSET_NAME "__members_offset" @@ -68,7 +68,7 @@ enum { //thread index for bihash enum { THREAD_PROFILER_READER_IDX = 0, - THREAD_PROC_ACT_IDX_BASE + THREAD_PROC_ACT_IDX_BASE = 2, }; /* @@ -122,7 +122,7 @@ enum { * may lead to increased overhead and memory usage, so it is * recommended to use it with caution. 
*/ -#ifndef PERF_MAX_STACK_DEPTH +#ifndef PERF_MAX_STACK_DEPTH #define PERF_MAX_STACK_DEPTH 127 #endif @@ -131,19 +131,19 @@ enum { */ #define MAP_STACK_A_NAME "__stack_map_a" #define MAP_STACK_B_NAME "__stack_map_b" -#define MAP_PROFILER_STATE_MAP "__profiler_state_map" +#define MAP_PROFILER_STATE_NAME "__profiler_state_map" #define STRINGIFIER_STACK_STR_HASH_BUCKETS_NUM 8192 -#define STRINGIFIER_STACK_STR_HASH_MEM_SZ (1ULL << 30) // 1Gbytes +#define STRINGIFIER_STACK_STR_HASH_MEM_SZ (1ULL << 30) // 1Gbytes #define SYMBOLIZER_CACHES_HASH_BUCKETS_NUM 8192 -#define SYMBOLIZER_CACHES_HASH_MEM_SZ (1ULL << 31) // 2Gbytes +#define SYMBOLIZER_CACHES_HASH_MEM_SZ (1ULL << 31) // 2Gbytes #define STACK_TRACE_MSG_HASH_BUCKETS_NUM 8192 -#define STACK_TRACE_MSG_HASH_MEM_SZ (1ULL << 32) // 4Gbytes +#define STACK_TRACE_MSG_HASH_MEM_SZ (1ULL << 32) // 4Gbytes -#define PROFILER_READER_EPOLL_TIMEOUT 500 //msecs -#define EPOLL_SHORT_TIMEOUT 100 //mescs +#define PROFILER_READER_EPOLL_TIMEOUT 500 //msecs +#define EPOLL_SHORT_TIMEOUT 100 //mescs /* * Process information recalibration time, this time is the number of seconds @@ -152,7 +152,7 @@ enum { * The Java process will delay obtaining the symbol table by * 'PROC_INFO_VERIFY_TIME' seconds after it starts running. */ -#define PROC_INFO_VERIFY_TIME 60 // 60 seconds +#define PROC_INFO_VERIFY_TIME 60 // 60 seconds /* * This value is used to determine which type of Java agent's so library to @@ -168,12 +168,12 @@ enum { * date the Java symbol table. This is done The purpose is to avoid freque- * nt updates of the java symbol table. */ -#define JAVA_SYMS_UPDATE_DELAY_DEF 60 // 60 seconds -#define JAVA_SYMS_UPDATE_DELAY_MIN 5 // 5 seconds -#define JAVA_SYMS_UPDATE_DELAY_MAX 3600 // 3600 seconds +#define JAVA_SYMS_UPDATE_DELAY_DEF 60 // 60 seconds +#define JAVA_SYMS_UPDATE_DELAY_MIN 5 // 5 seconds +#define JAVA_SYMS_UPDATE_DELAY_MAX 3600 // 3600 seconds /* Profiler - maximum data push interval time (in nanosecond). */ -#define MAX_PUSH_MSG_TIME_INTERVAL 1000000000ULL /* 1 seconds */ +#define MAX_PUSH_MSG_TIME_INTERVAL 1000000000ULL /* 1 seconds */ /* * timer config @@ -192,24 +192,24 @@ enum { * the data resident in the eBPF buffer. This value is the periodic time, unit * is milliseconds. */ -#define KICK_KERN_PERIOD 10 // 10 ticks(100 milliseconds) +#define KICK_KERN_PERIOD 10 // 10 ticks(100 milliseconds) /* * System boot time update cycle time, unit is milliseconds. */ -#define SYS_TIME_UPDATE_PERIOD 1000 // 1000 ticks(10 seconds) +#define SYS_TIME_UPDATE_PERIOD 1000 // 1000 ticks(10 seconds) /* * Check whether the eBPF Map exceeds the maximum value and use it to release * stale data (unit is milliseconds). */ -#define CHECK_MAP_EXCEEDED_PERIOD 100 // 100 ticks(1 seconds) +#define CHECK_MAP_EXCEEDED_PERIOD 100 // 100 ticks(1 seconds) /* * Used to check whether the kernel adaptation is successful, here is the * check cycle time (unit is milliseconds). */ -#define CHECK_KERN_ADAPT_PERIOD 100 // 100 ticks(1 seconds) +#define CHECK_KERN_ADAPT_PERIOD 100 // 100 ticks(1 seconds) /* * The maximum space occupied by the Java symbol files in the target POD. @@ -217,15 +217,15 @@ enum { * of 2Mi to 100Mi. If the configuration value is outside this range, the * default value of 10(10Mi), will be used. 
*/ -#define JAVA_POD_WRITE_FILES_SPACE_MIN 2097152 // 2Mi -#define JAVA_POD_WRITE_FILES_SPACE_MAX 104857600 // 100Mi -#define JAVA_POD_WRITE_FILES_SPACE_DEF 10485760 // 10Mi +#define JAVA_POD_WRITE_FILES_SPACE_MIN 2097152 // 2Mi +#define JAVA_POD_WRITE_FILES_SPACE_MAX 104857600 // 100Mi +#define JAVA_POD_WRITE_FILES_SPACE_DEF 10485760 // 10Mi /* * The `df_java_agent_musl.so` and `df_java_agent.so` files will also be * placed in the target POD for loading operations. They occupy less than * 300Ki of space. */ -#define JAVA_POD_EXTRA_SPACE_MMA 307200 // 300Ki +#define JAVA_POD_EXTRA_SPACE_MMA 307200 // 300Ki /* * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, @@ -263,6 +263,6 @@ enum { * The random value has a maximum limit specified above(measured in seconds). */ -#define PROFILER_DEFER_RANDOM_MAX 60 // 60 seconds +#define PROFILER_DEFER_RANDOM_MAX 60 // 60 seconds #endif /* DF_EBPF_CONFIG_H */ diff --git a/agent/src/ebpf/user/profile/extended/extended.c b/agent/src/ebpf/user/profile/extended/extended.c new file mode 100644 index 00000000000..fe4ddb92207 --- /dev/null +++ b/agent/src/ebpf/user/profile/extended/extended.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../config.h" +#include "../../utils.h" +#include "../../common.h" +#include "../../mem.h" +#include "../../log.h" +#include "../../types.h" +#include "../../vec.h" +#include "../../tracer.h" +#include "../../socket.h" + +int __attribute__ ((weak)) extended_reader_create(struct bpf_tracer *tracer) +{ + return 0; +} diff --git a/agent/src/ebpf/user/profile/extended/extended.h b/agent/src/ebpf/user/profile/extended/extended.h new file mode 100644 index 00000000000..0aa597cc161 --- /dev/null +++ b/agent/src/ebpf/user/profile/extended/extended.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DF_PROFILE_EXT_H +#define DF_PROFILE_EXT_H +int extended_reader_create(struct bpf_tracer *tracer); +#endif /* DF_PROFILE_EXT_H */ diff --git a/agent/src/ebpf/user/profile/java/gen_syms_file.c b/agent/src/ebpf/user/profile/java/gen_syms_file.c index 042626c3440..0dc31e4a7e6 100644 --- a/agent/src/ebpf/user/profile/java/gen_syms_file.c +++ b/agent/src/ebpf/user/profile/java/gen_syms_file.c @@ -62,8 +62,9 @@ void gen_java_symbols_file(int pid, int *ret_val, bool error_occurred) char args[PERF_PATH_SZ * 2]; snprintf(args, sizeof(args), - "%d %d," DF_AGENT_LOCAL_PATH_FMT ".map," DF_AGENT_LOCAL_PATH_FMT ".log", - pid, g_java_syms_write_bytes_max, pid, pid); + "%d %d," DF_AGENT_LOCAL_PATH_FMT ".map," + DF_AGENT_LOCAL_PATH_FMT ".log", pid, + g_java_syms_write_bytes_max, pid, pid); i64 curr_local_sz; curr_local_sz = get_local_symbol_file_sz(pid, target_ns_pid); @@ -131,6 +132,7 @@ void java_syms_update_main(void *arg) if (task != NULL) { struct symbolizer_proc_info *p = task->p; + symbolizer_proc_lock(p); /* JAVA process has not exited. */ if (AO_GET(&p->use) > 1) { int ret; @@ -151,10 +153,10 @@ void java_syms_update_main(void *arg) AO_SET(&p->new_java_syms_file, true); } - + symbolizer_proc_unlock(p); /* Ensure that all tasks are completed before releasing. */ if (AO_SUB_F(&p->use, 1) == 0) { - clib_mem_free((void *)p); + free_proc_cache(p); } clib_mem_free((void *)task); diff --git a/agent/src/ebpf/user/profile/perf_profiler.c b/agent/src/ebpf/user/profile/perf_profiler.c index 6be2a72f61e..287daa2b6fb 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.c +++ b/agent/src/ebpf/user/profile/perf_profiler.c @@ -44,113 +44,37 @@ #include #include "java/config.h" #include "java/df_jattach.h" +#include "profile_common.h" #include "../perf_profiler_bpf_common.c" -/* - * This section is for symbolization of Java addresses, and we need - * to prepare two so librarys, one for GNU and the other for MUSL: - * - * df_java_agent.so - * df_java_agent_musl.so - * - * These two files need to be saved in the '/tmp' directory, and the - * agent so library will be injected into the JVM to generate - * 'perf-.map'. - */ -#include "java_agent_so_gnu.c" -#include "java_agent_so_musl.c" -/* use for java symbols generate */ -#include "deepflow_jattach_bin.c" - #define LOG_CP_TAG "[CP] " #define CP_TRACER_NAME "continuous_profiler" #define CP_PERF_PG_NUM 16 +static struct profiler_context oncpu_ctx; + /* The maximum bytes limit for writing the df_perf-PID.map file by agent.so */ int g_java_syms_write_bytes_max; +static bool g_enable_oncpu = true; + /* Used for handling updates to JAVA symbol files */ static pthread_t java_syms_update_thread; -extern int major, minor; extern char linux_release[128]; extern __thread uword thread_index; -struct stack_trace_key_t *raw_stack_data; -static u64 stack_trace_lost; -static struct bpf_tracer *profiler_tracer; -static volatile u64 profiler_stop; - -// for stack_trace_msg_hash relese -static __thread stack_trace_msg_hash_kv *trace_msg_kvps; -static __thread bool msg_clear_hash; +struct bpf_tracer *profiler_tracer; // for flame-graph test static FILE *folded_file; #define FOLDED_FILE_PATH "./profiler.folded" -static char *flame_graph_start_time; +char *flame_graph_start_time; static char *flame_graph_end_time; -/* profiler start time(monotonic seconds). 
*/ -static u64 start_time; -/* Record the time of the last data push - * (in seconds since system startup)*/ -static u64 last_push_time; -static u64 push_count; - -/* - * To perform regular expression matching on process names, - * the 'profiler_regex' is set using the 'set_profiler_regex()' - * interface. Processes that successfully match the regular - * expression are aggregated using the key: - * `{pid + stime + u_stack_id + k_stack_id + tid + cpu}`. - * - * For processes that do not match, they are aggregated using the - * key: - * ``. - */ -static regex_t profiler_regex; -static bool regex_existed = false; - -/* - * 'cpu_aggregation_flag' is used to set whether to retrieve CPUID - * and include it in the aggregation of stack trace data. - * - * If valude is set to 1, CPUID will be retrieved and included in - * the aggregation of stack trace data. If value is set to 0, - * CPUID will not be retrieved and will not be included in the - * aggregation. Any other value is considered invalid. - */ -static volatile u64 cpu_aggregation_flag; - -/* - * Cache hash: obtain folded stack trace string from stack ID. - */ -static stack_str_hash_t g_stack_str_hash; - -/* - * Used for tracking data statistics and pushing. - */ -static stack_trace_msg_hash_t g_msg_hash; - -/* - * The iteration count causes BPF to switch buffers with each iteration. - */ -static u64 transfer_count; -static u64 process_count; - -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h); -static void print_cp_tracer_status(struct bpf_tracer *t); - -/* - * During the parsing process, it is possible for processes in procfs - * to be missing (processes that start and exit quickly). This variable - * is used to count the number of lost processes during the parsing process. - */ -static atomic64_t process_lost_count; +static void print_cp_tracer_status(struct bpf_tracer *t, + struct profiler_context *ctx); /* Continuous Profiler debug related settings. */ static pthread_mutex_t cpdbg_mutex; @@ -160,160 +84,33 @@ static bool cpdbg_use_remote; static uint32_t cpdbg_start_time; static uint32_t cpdbg_timeout; -/* Record all stack IDs in each iteration for quick retrieval. */ -struct stack_ids_bitmap stack_ids_a; -struct stack_ids_bitmap stack_ids_b; -/* This vector table is used to remove a stack from the stack map. */ -static int *clear_stack_ids_a; -static int *clear_stack_ids_b; -static u64 stackmap_clear_failed_count; - -/* perf buffer queue loss statistics */ -static u64 perf_buf_lost_a_count; -static u64 perf_buf_lost_b_count; - -static volatile u64 g_enable_perf_sample; - -static u64 get_process_lost_count(void) -{ - return atomic64_read(&process_lost_count); -} - -static inline stack_trace_msg_t *alloc_stack_trace_msg(int len) -{ - void *trace_msg; - trace_msg = clib_mem_alloc_aligned("stack_msg", len, 0, NULL); - if (trace_msg == NULL) { - ebpf_warning("stack trace msg alloc memory failed.\n"); - } else { - stack_trace_msg_t *msg = trace_msg; - return msg; - } - - return NULL; -} - -/* - * The invocation of this interface is always when the process name does - * not match. 
- */ -static void set_msg_kvp_by_comm(stack_trace_msg_kv_t * kvp, - struct stack_trace_key_t *v, void *msg_value) +static u64 get_process_lost_count(struct profiler_context *ctx) { - strcpy_s_inline(kvp->c_k.comm, sizeof(kvp->c_k.comm), - v->comm, strlen(v->comm)); - kvp->c_k.cpu = v->cpu; - kvp->c_k.pid = v->tgid; - kvp->c_k.reserved = 0; - kvp->msg_ptr = pointer_to_uword(msg_value); -} - -static void set_msg_kvp(stack_trace_msg_kv_t * kvp, - struct stack_trace_key_t *v, u64 stime, void *msg_value) -{ - kvp->k.tgid = v->tgid; - kvp->k.pid = v->pid; - kvp->k.stime = stime; - kvp->k.cpu = v->cpu; - kvp->k.u_stack_id = (u32) v->userstack; - kvp->k.k_stack_id = (u32) v->kernstack; - kvp->msg_ptr = pointer_to_uword(msg_value); -} - -static void set_stack_trace_msg(stack_trace_msg_t * msg, - struct stack_trace_key_t *v, - bool matched, - u64 stime, - u64 ns_id, - const char *process_name, - const char *container_id) -{ - msg->pid = v->tgid; - msg->tid = v->pid; - msg->cpu = v->cpu; - msg->u_stack_id = (u32) v->userstack; - msg->k_stack_id = (u32) v->kernstack; - strcpy_s_inline(msg->comm, sizeof(msg->comm), v->comm, strlen(v->comm)); - msg->stime = stime; - msg->netns_id = ns_id; - if (container_id != NULL) { - strcpy_s_inline(msg->container_id, sizeof(msg->container_id), - container_id, strlen(container_id)); - } - - if (stime > 0) { - /* - * Note: There is no process with PID 0 in procfs. - * If the PID is 0, it will return the kernel's - * startup time, and the process name will be - * obtained from data retrieved through eBPF. - */ - if (msg->pid == 0) { - memcpy(msg->process_name, v->comm, sizeof(msg->comm)); - } else { - if (process_name != NULL) { - strcpy_s_inline(msg->process_name, - sizeof(msg->process_name), - process_name, - strlen(process_name)); - } - } - - } else { - - /* - * If the process has already exited, then execution reaches - * this point, which means aggregating data based on the - * process name. - */ - strcpy_s_inline(msg->process_name, sizeof(msg->process_name), - v->comm, strlen(v->comm)); - atomic64_inc(&process_lost_count); - } - - if (!matched || stime <= 0) { - /* The aggregation method is identified as - * { process name + [u,k]stack_trace_id + cpu} */ - msg->stime = 0; - if (!matched) { - msg->pid = msg->tid = 0; - snprintf((char *)msg->process_name, - sizeof(msg->process_name), "%s", "Total"); - } - } - - msg->time_stamp = gettime(CLOCK_REALTIME, TIME_TYPE_NAN); - msg->count = 1; - msg->data_ptr = pointer_to_uword(&msg->data[0]); - - /* Only use for test flame graph. 
*/ - if (flame_graph_start_time == NULL) { - flame_graph_start_time = gen_file_name_by_datetime(); - } + return atomic64_read(&ctx->process_lost_count); } static void reader_lost_cb_a(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_a_count++; + oncpu_ctx.perf_buf_lost_a_count++; } static void reader_lost_cb_b(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_b_count++; + oncpu_ctx.perf_buf_lost_b_count++; } static void reader_raw_cb(void *cookie, void *raw, int raw_size) { - if (unlikely(profiler_stop == 1)) + if (unlikely(oncpu_ctx.profiler_stop == 1)) return; struct reader_forward_info *fwd_info = cookie; if (unlikely(fwd_info->queue_id != 0)) { - ebpf_warning("cookie(%d) error", (u64)cookie); + ebpf_warning("cookie(%d) error", (u64) cookie); return; } @@ -322,7 +119,7 @@ static void reader_raw_cb(void *cookie, void *raw, int raw_size) v = (struct stack_trace_key_t *)raw; int ret = VEC_OK; - vec_add1(raw_stack_data, *v, ret); + vec_add1(oncpu_ctx.raw_stack_data, *v, ret); if (ret != VEC_OK) { ebpf_warning("vec add failed\n"); } @@ -340,7 +137,7 @@ static int release_profiler(struct bpf_tracer *tracer) /* free all readers */ free_all_readers(tracer); - print_cp_tracer_status(tracer); + print_cp_tracer_status(tracer, &oncpu_ctx); /* release object */ release_object(tracer->obj); @@ -350,16 +147,6 @@ static int release_profiler(struct bpf_tracer *tracer) return ETR_OK; } -static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, - const char *name) -{ - memset(h, 0, sizeof(*h)); - u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; - u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; - return stack_trace_msg_hash_init(h, (char *)name, - nbuckets, hash_memory_size); -} - static inline bool is_cpdbg_timeout(void) { uint32_t passed_sec; @@ -391,12 +178,12 @@ static void print_cp_data(stack_trace_msg_t * msg) char buff[DEBUG_BUFF_SIZE]; snprintf(buff, sizeof(buff), - "%s [cpdbg] netns_id %lu container_id %s pid %u tid %u " + "%s [cpdbg] type %d netns_id %lu container_id %s pid %u tid %u " "process_name %s comm %s stime %lu u_stack_id %u k_statck_id" " %u cpu %u count %u tiemstamp %lu datalen %u data %s\n", - timestamp, msg->netns_id, cid, msg->pid, msg->tid, - msg->process_name, msg->comm, msg->stime, - msg->u_stack_id, + timestamp, msg->profiler_type, msg->netns_id, + cid, msg->pid, msg->tid, msg->process_name, msg->comm, + msg->stime, msg->u_stack_id, msg->k_stack_id, msg->cpu, msg->count, msg->time_stamp, msg->data_len, msg->data); @@ -410,7 +197,7 @@ static void print_cp_data(stack_trace_msg_t * msg) } } -static void cpdbg_process(stack_trace_msg_t * msg) +void cpdbg_process(stack_trace_msg_t * msg) { pthread_mutex_lock(&cpdbg_mutex); if (unlikely(cpdbg_enable)) { @@ -421,631 +208,20 @@ static void cpdbg_process(stack_trace_msg_t * msg) pthread_mutex_unlock(&cpdbg_mutex); } -static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *ctx) -{ - stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv; - if (msg_kv->msg_ptr != 0) { - stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr; - - /* continuous profiler debug */ - cpdbg_process(msg); - - tracer_callback_t fun = profiler_tracer->process_fn; - /* - * Execute callback function to hand over the data to the - * higher level for processing. The higher level will se- - * nd the data to the server for storage as required. 
- */ - if (likely(profiler_stop == 0)) - fun(msg); - - clib_mem_free((void *)msg); - msg_kv->msg_ptr = 0; - } - - int ret = VEC_OK; - vec_add1(trace_msg_kvps, *kv, ret); - if (ret != VEC_OK) { - ebpf_warning("vec add failed\n"); - msg_clear_hash = true; - } - - return BIHASH_WALK_CONTINUE; -} - -/* - * Push the data and release the resources. - * @is_force: Do you need to perform a forced release? - */ -static void push_and_release_stack_trace_msg(stack_trace_msg_hash_t * h, - bool is_force) -{ - ASSERT(profiler_tracer != NULL); - - u64 curr_time, elapsed; - curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN); - elapsed = curr_time - last_push_time; - /* - * If the aggregated stack trace data obtained by the profiler - * satisfies one of the following conditions, it should be pushed - * to the upper-level processing: - * - * If the time interval since the last push exceeds or equals - * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL). - * - * Otherwise, it should return directly. - */ - if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) - return; - - /* update last push time. */ - last_push_time = curr_time; - push_count++; - - stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, - NULL); - /* - * In this iteration, all elements will be cleared, and in the - * next iteration, this hash will be reused. - */ - stack_trace_msg_hash_kv *v; - vec_foreach(v, trace_msg_kvps) { - if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { - ebpf_warning - ("stack_trace_msg_hash_add_del() failed.\n"); - msg_clear_hash = true; - } - } - - vec_free(trace_msg_kvps); - - h->hit_hash_count = 0; - h->hash_elems_count = 0; - - if (msg_clear_hash) { - msg_clear_hash = false; - stack_trace_msg_hash_free(h); - } -} - -static inline void add_stack_id_to_bitmap(int stack_id, bool is_a) -{ - if (stack_id < 0) - return; - - struct stack_ids_bitmap *ids; - if (is_a) - ids = &stack_ids_a; - else - ids = &stack_ids_b; - - if (!is_set_bitmap(ids->bitmap, stack_id)) { - set_bitmap(ids->bitmap, stack_id); - int ret = VEC_OK; - - if (is_a) - vec_add1(clear_stack_ids_a, stack_id, ret); - else - vec_add1(clear_stack_ids_b, stack_id, ret); - - if (ret != VEC_OK) { - ebpf_warning("vec add failed\n"); - } - - ids->count++; - } -} - -static inline void update_matched_process_in_total(stack_trace_msg_hash_t * - msg_hash, - char *process_name, - struct stack_trace_key_t *v) -{ - stack_trace_msg_kv_t kv; - set_msg_kvp_by_comm(&kv, v, (void *)0); - - if (stack_trace_msg_hash_search - (msg_hash, (stack_trace_msg_hash_kv *) & kv, - (stack_trace_msg_hash_kv *) & kv) == 0) { - __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); - ((stack_trace_msg_t *) kv.msg_ptr)->count++; - return; - } - - /* append ';' '\0' and '[p/t]' */ - char trace_str[(TASK_COMM_LEN * 2) + 10]; - bool is_thread = (v->pid != v->tgid); - if (is_thread) - snprintf(trace_str, sizeof(trace_str), "[p] %s;[t] %s", - process_name, v->comm); - else - snprintf(trace_str, sizeof(trace_str), "[p] %s", process_name); - - /* append 2 byte for ';''\0' */ - int len = sizeof(stack_trace_msg_t) + strlen(trace_str) + 2; - stack_trace_msg_t *msg = alloc_stack_trace_msg(len); - if (msg == NULL) { - clib_mem_free(trace_str); - return; - } - - set_stack_trace_msg(msg, v, false, 0, 0, process_name, NULL); - snprintf((char *)&msg->data[0], strlen(trace_str) + 2, "%s", trace_str); - msg->data_len = strlen((char *)msg->data); - kv.msg_ptr = pointer_to_uword(msg); - - if (stack_trace_msg_hash_add_del(msg_hash, - (stack_trace_msg_hash_kv - *) & kv, 1 /* 
is_add */ )) { - ebpf_warning("stack_trace_msg_hash_add_del() failed.\n"); - clib_mem_free(msg); - } else { - __sync_fetch_and_add(&msg_hash->hash_elems_count, 1); - } -} - -static void aggregate_stack_traces(struct bpf_tracer *t, - const char *stack_map_name, - stack_str_hash_t * stack_str_hash, - stack_trace_msg_hash_t * msg_hash, - u32 * count, bool use_a_map) -{ - struct stack_trace_key_t *v; - vec_foreach(v, raw_stack_data) { - if (v == NULL) - break; - - if (unlikely(profiler_stop == 1)) - break; - - /* - * If cpu_aggregation_flag=0, the CPU value for stack trace data - * reporting is a special value (CPU_INVALID:0xfff) used to indicate - * that it is an invalid value, the CPUID will not be included in - * the aggregation. - */ - if (cpu_aggregation_flag == 0) - v->cpu = CPU_INVALID; - - /* - * Uniform idle process names to reduce the aggregated count of stack - * trace data (when we aggregate using process names as part of the key). - * "swapper/0", "swapper/1", "swapper/2" ... > "swapper" - */ - if (v->pid == v->tgid && v->pid == 0) { - const char *idle_name = "swapper"; - strcpy_s_inline(v->comm, sizeof(v->comm), - idle_name, strlen(idle_name)); - } - - /* -EEXIST: Hash bucket collision in the stack trace table */ - if (v->kernstack == -EEXIST) - stack_trace_lost++; - - if (v->userstack == -EEXIST) - stack_trace_lost++; - - add_stack_id_to_bitmap(v->kernstack, use_a_map); - add_stack_id_to_bitmap(v->userstack, use_a_map); - - /* Total iteration count for this iteration. */ - (*count)++; - - /* Total iteration count for all iterations. */ - process_count++; - - /* - * Firstly, search the stack-trace-msg hash to see if the - * stack trace messages has already been stored. - */ - stack_trace_msg_kv_t kv; - char name[TASK_COMM_LEN]; - memset(name, 0, sizeof(name)); - u64 stime, netns_id; - stime = netns_id = 0; - void *info_p = NULL; - char *process_name = NULL; - bool matched, is_match_finish; - matched = is_match_finish = false; - - /* If it is a process, match operation will be performed immediately. */ - if (v->pid == v->tgid) { - is_match_finish = true; - matched = (regexec(&profiler_regex, v->comm, 0, NULL, 0) - == 0); - if (!matched) { - set_msg_kvp_by_comm(&kv, v, (void *)0); - goto skip_proc_find; - } - } - - get_process_info_by_pid(v->tgid, &stime, &netns_id, - (char *)name, &info_p); - - /* - * If the data collected is from a running process, and the process - * name and the command name of the task (captured by eBPF) are not - * consistent, it indicates that the cached process information is - * no longer valid. - */ - if (stime > 0 && v->pid == v->tgid && strcmp(name, v->comm)) { - stime = netns_id = 0; - name[0] = '\0'; - process_name = NULL; - info_p = NULL; - } - - if (stime > 0) { - if (v->tgid == 0) - process_name = v->comm; - else - process_name = name; - - if (!is_match_finish) - matched = - (regexec - (&profiler_regex, process_name, 0, NULL, 0) - == 0); - if (matched) - set_msg_kvp(&kv, v, stime, (void *)0); - else - set_msg_kvp_by_comm(&kv, v, (void *)0); - } else { - /* Not find process in procfs. */ - set_msg_kvp_by_comm(&kv, v, (void *)0); - } - - /* - * Here, we duplicate the matched process data and place it into - * the Total process, with the aim of showcasing the proportion - * of each process in the overall sampling. 
- */ - if (matched) - update_matched_process_in_total(msg_hash, process_name, - v); - - skip_proc_find: - if (stack_trace_msg_hash_search - (msg_hash, (stack_trace_msg_hash_kv *) & kv, - (stack_trace_msg_hash_kv *) & kv) == 0) { - __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); - ((stack_trace_msg_t *) kv.msg_ptr)->count++; - continue; - } - - /* - * Folded stack trace string and generate stack trace messages. - * - * Folded stack trace string (taken from a performance profiler test): - * main;xxx();yyy() - * It is a list of symbols corresponding to addresses in the underlying - * stack trace, separated by ';'. - */ - - char *trace_str = - resolve_and_gen_stack_trace_str(t, v, stack_map_name, - stack_str_hash, matched, - process_name, info_p); - if (trace_str) { - /* - * append process/thread name to stack string - * append 2 byte for ';''\0' - * append pre_tag '[p/t]' - */ - char pre_tag[5]; - int str_len = strlen(trace_str) + 2; - if (matched) - str_len += strlen(v->comm) + sizeof(pre_tag); - - int len = sizeof(stack_trace_msg_t) + str_len; - stack_trace_msg_t *msg = alloc_stack_trace_msg(len); - if (msg == NULL) { - clib_mem_free(trace_str); - continue; - } - - memset(msg, 0, len); - struct symbolizer_proc_info *__p = info_p; - set_stack_trace_msg(msg, v, matched, stime, netns_id, - process_name, - __p ? __p->container_id : NULL); - - snprintf(pre_tag, sizeof(pre_tag), "%s ", - v->pid == v->tgid ? "[p]" : "[t]"); - if (matched) - snprintf((char *)&msg->data[0], str_len, - "%s%s;%s", pre_tag, v->comm, - trace_str); - else - snprintf((char *)&msg->data[0], str_len, "%s", - trace_str); - - msg->data_len = strlen((char *)msg->data); - clib_mem_free(trace_str); - kv.msg_ptr = pointer_to_uword(msg); - - if (stack_trace_msg_hash_add_del(msg_hash, - (stack_trace_msg_hash_kv - *) & kv, - 1 /* is_add */ )) { - ebpf_warning - ("stack_trace_msg_hash_add_del() failed.\n"); - clib_mem_free(msg); - } else { - __sync_fetch_and_add - (&msg_hash->hash_elems_count, 1); - } - } - - /* check and clean symbol cache */ - exec_proc_info_cache_update(); - } - - vec_free(raw_stack_data); -} - -void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag) -{ - if (bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - ENABLE_IDX, &enable_flag) == false) { - ebpf_warning("profiler state map update error." 
- "(%s enable_flag %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - enable_flag, strerror(errno)); - } - - g_enable_perf_sample = enable_flag; - - ebpf_info("%s() success, enable_flag:%d\n", __func__, enable_flag); -} - -static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, - const char *stack_map_name) -{ - struct ebpf_map *map = - ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); - if (map == NULL) { - ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, - stack_map_name); - return 0; - } - int map_fd = map->fd; - - u32 key = 0, next_key; - u32 reclaim_count = 0; - u32 find_count = 0; - struct list_head clear_elem_head; - init_list_head(&clear_elem_head); - - while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { - find_count++; - insert_list(&next_key, sizeof(next_key), &clear_elem_head); - key = next_key; - } - - reclaim_count = __reclaim_map(map_fd, &clear_elem_head); - - ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", - __func__, stack_map_name, find_count, reclaim_count); - - return reclaim_count; -} - -static void cleanup_stackmap(struct bpf_tracer *t, - const char *stack_map_name, bool is_a) -{ - struct stack_ids_bitmap *ids; - int *clear_stack_ids; - u64 *perf_buf_lost_p = NULL; - - if (is_a) { - ids = &stack_ids_a; - clear_stack_ids = clear_stack_ids_a; - perf_buf_lost_p = &perf_buf_lost_a_count; - } else { - ids = &stack_ids_b; - clear_stack_ids = clear_stack_ids_b; - perf_buf_lost_p = &perf_buf_lost_b_count; - } - - if (ids->count != vec_len(clear_stack_ids)) { - ebpf_warning - ("stack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", - ids->count, vec_len(clear_stack_ids)); - } - - /* - * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, - * which may lead to out-of-order behavior in a multi-core environment. - * We have employed a threshold to delay the cleanup of the stack map, reducing the - * occurrence of premature clearing of stack entries caused by the disorder in stack - * data. - * - * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in - * 'agent/src/ebpf/user/config.h'. - */ - if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) { - int *sid; - vec_foreach(sid, clear_stack_ids) { - int id = *sid; - if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) { - /* - * It may be due to the disorder in the perf buffer transmission, - * leading to the repetitive deletion of the same stack ID. - */ - stackmap_clear_failed_count++; - } - - clear_bitmap(ids->bitmap, id); - } - - if (is_a) - vec_free(clear_stack_ids_a); - else - vec_free(clear_stack_ids_b); - - ids->count = 0; - - /* - * If data loss occurs due to the user-space receiver program - * being too busy and not promptly fetching data from the perf - * buffer, it is necessary to clean the stack map once to prevent - * excessive remnants of stack data from affecting the acquisition - * of new stack data (i.e., eBPF using the bpf_get_stackid() - * interface will return -EEXIST). - */ - if (*perf_buf_lost_p > 0) { - delete_all_stackmap_elems(t, stack_map_name); - *perf_buf_lost_p = 0; - } - } -} - -static void process_bpf_stacktraces(struct bpf_tracer *t, - struct bpf_perf_reader *r_a, - struct bpf_perf_reader *r_b) -{ - struct bpf_perf_reader *r; - const char *stack_map_name; - bool using_map_set_a = (transfer_count % 2 == 0); - r = using_map_set_a ? r_a : r_b; - stack_map_name = using_map_set_a ? MAP_STACK_A_NAME : MAP_STACK_B_NAME; - const u64 sample_count_idx = - using_map_set_a ? 
SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; - - struct epoll_event events[r->readers_count]; - int nfds = reader_epoll_wait(r, events, 0); - - transfer_count++; - /* update map MAP_PROFILER_STATE_MAP */ - if (bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - TRANSFER_CNT_IDX, &transfer_count) == false) { - ebpf_warning("profiler state map update error." - "(%s transfer_count %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - transfer_count, strerror(errno)); - transfer_count--; - } - - /* Total iteration count for this iteration. */ - u32 count = 0; - - /* eBPF map record count for this iteration. */ - u64 sample_cnt_val = 0; - - /* - * Why use g_stack_str_hash? - * - * When the stringizer encounters a stack-ID for the first time in - * the stack trace table, it clears it. If a stack-ID is reused by - * different stack trace keys, the stringizer returns its memoized - * stack trace string. Since stack IDs are unstable between profile - * iterations, we create and destroy the stringizer in each profile - * iteration. - */ - if (unlikely(g_stack_str_hash.buckets == NULL)) { - if (init_stack_str_hash(&g_stack_str_hash, "profile_stack_str")) { - ebpf_warning("init_stack_str_hash() failed.\n"); - return; - } - } - - /* - * During each transmission iteration, we have a hashmap structure in - * place for the following purposes: - * - * 1 Pushing the data of this iteration to the higher-level processing. - * 2 Performing data statistics based on the stack trace data, using the - * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + - * u_stack_id + " as the key. - * - * Here is the key-value pair structure of the hashmap: - * see perf_profiler.h (stack_trace_msg_kv_t) - * This is the final form of the data. If the current stack trace message - * is a match, we only need to increment the count field in the correspon- - * ding value, thus avoiding duplicate parsing. - */ - if (unlikely(g_msg_hash.buckets == NULL)) { - if (init_stack_trace_msg_hash(&g_msg_hash, "stack_trace_msg")) { - ebpf_warning("init_stack_trace_msg_hash() failed.\n"); - return; - } - } - - if (nfds > 0) { - - check_again: - if (unlikely(profiler_stop == 1)) - goto release_iter; - - /* - * If there is data, the reader's callback - * function will be called. - */ - reader_event_read(events, nfds); - - /* - * After the reader completes data reading, the work of - * data aggregation will be blocked if there is no data. - */ - aggregate_stack_traces(t, stack_map_name, &g_stack_str_hash, - &g_msg_hash, &count, using_map_set_a); - - /* - * To ensure that all data in the perf ring-buffer is procenssed - * in this iteration, as this iteration will clean up all the - * data recorded in the stackmap, any residual data in the perf - * ring-buffer will be carried over to the next iteration for - * processing. This poses a risk of not being able to find the - * corresponding stackmap records in the next iteration, leading - * to incomplete processing. - */ - if (bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, - (void *)&sample_cnt_val)) { - if (sample_cnt_val > count) { - nfds = reader_epoll_short_wait(r, events, 0); - if (nfds > 0) - goto check_again; - } - } - } - -release_iter: - - cleanup_stackmap(t, stack_map_name, using_map_set_a); - - /* Now that we've consumed the data, reset the sample count in BPF. 
*/ - sample_cnt_val = 0; - bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, &sample_cnt_val); - - print_profiler_status(t, count, &g_stack_str_hash, &g_msg_hash); - - /* free all elems */ - clean_stack_strs(&g_stack_str_hash); - - /* Push messages and free stack_trace_msg_hash */ - push_and_release_stack_trace_msg(&g_msg_hash, false); -} - static void java_syms_update_work(void *arg) { java_syms_update_main(arg); } -static void cp_reader_work(void *arg) +static void oncpu_reader_work(void *arg) { thread_index = THREAD_PROFILER_READER_IDX; struct bpf_tracer *t = profiler_tracer; - struct bpf_perf_reader *reader_a, *reader_b; - reader_a = &t->readers[0]; - reader_b = &t->readers[1]; for (;;) { - if (unlikely(profiler_stop == 1)) { - if (g_enable_perf_sample) - set_enable_perf_sample(t, 0); + if (unlikely(oncpu_ctx.profiler_stop == 1)) { + if (oncpu_ctx.enable_bpf_profile) + set_bpf_run_enabled(t, &oncpu_ctx, 0); goto exit; } @@ -1060,45 +236,44 @@ static void cp_reader_work(void *arg) * exit events. We want to ensure that everything is ready * before the profiler performs address translation. */ - if (unlikely(!regex_existed || + if (unlikely(!oncpu_ctx.regex_existed || get_socket_tracer_state() != TRACER_RUNNING)) { - if (g_enable_perf_sample) - set_enable_perf_sample(t, 0); + if (oncpu_ctx.enable_bpf_profile) + set_bpf_run_enabled(t, &oncpu_ctx, 0); exec_proc_info_cache_update(); sleep(1); continue; } - if (unlikely(!g_enable_perf_sample)) - set_enable_perf_sample(t, 1); + if (unlikely(!oncpu_ctx.enable_bpf_profile)) + set_bpf_run_enabled(t, &oncpu_ctx, 1); - tracer_reader_lock(t); - process_bpf_stacktraces(t, reader_a, reader_b); - tracer_reader_unlock(t); + process_bpf_stacktraces(&oncpu_ctx, t); } exit: - print_cp_tracer_status(t); + print_cp_tracer_status(t, &oncpu_ctx); - print_hash_stack_str(&g_stack_str_hash); + print_hash_stack_str(&oncpu_ctx.stack_str_hash); /* free stack_str_hash */ - if (likely(g_stack_str_hash.buckets != NULL)) { - release_stack_str_hash(&g_stack_str_hash); + if (likely(oncpu_ctx.stack_str_hash.buckets != NULL)) { + release_stack_str_hash(&oncpu_ctx.stack_str_hash); } - print_hash_stack_trace_msg(&g_msg_hash); + print_hash_stack_trace_msg(&oncpu_ctx.msg_hash); /* free stack_str_hash */ - if (likely(g_msg_hash.buckets != NULL)) { + if (likely(oncpu_ctx.msg_hash.buckets != NULL)) { /* Ensure that all elements are released properly/cleanly */ - push_and_release_stack_trace_msg(&g_msg_hash, true); - stack_trace_msg_hash_free(&g_msg_hash); + push_and_release_stack_trace_msg(&oncpu_ctx, + &oncpu_ctx.msg_hash, true); + stack_trace_msg_hash_free(&oncpu_ctx.msg_hash); } /* resouce share release */ release_symbol_caches(); /* clear thread */ - t->perf_worker[0] = 0; + t->perf_workers[THREAD_PROFILER_READER_IDX] = 0; ebpf_info(LOG_CP_TAG "perf profiler reader-thread exit.\n"); pthread_exit(NULL); @@ -1114,42 +289,10 @@ static int create_profiler(struct bpf_tracer *tracer) if (tracer_bpf_load(tracer)) return ETR_LOAD; - set_enable_perf_sample(tracer, 0); - - /* - * create reader for read eBPF-profiler data. - * To implement eBPF perf-profiler double buffering output, - * it is necessary to create two readers to correspond to - * the double buffering structure design. 
- */ - struct bpf_perf_reader *reader_a, *reader_b; - reader_a = create_perf_buffer_reader(tracer, - MAP_PERF_PROFILER_BUF_A_NAME, - reader_raw_cb, - reader_lost_cb_a, - PROFILE_PG_CNT_DEF, 1, - PROFILER_READER_EPOLL_TIMEOUT); - if (reader_a == NULL) - return ETR_NORESOURCE; - - reader_b = create_perf_buffer_reader(tracer, - MAP_PERF_PROFILER_BUF_B_NAME, - reader_raw_cb, - reader_lost_cb_b, - PROFILE_PG_CNT_DEF, 1, - PROFILER_READER_EPOLL_TIMEOUT); - if (reader_b == NULL) { - free_perf_buffer_reader(reader_a); - return ETR_NORESOURCE; - } - /* clear old perf files */ exec_command("/usr/bin/rm -rf /tmp/perf-*.map", ""); exec_command("/usr/bin/rm -rf /tmp/perf-*.log", ""); - /* attach perf event */ - tracer_hooks_attach(tracer); - ret = create_work_thread("java_update", &java_syms_update_thread, (void *)java_syms_update_work, (void *)tracer); @@ -1158,17 +301,67 @@ static int create_profiler(struct bpf_tracer *tracer) goto error; } - /* - * Start a new thread to execute the data - * reading of perf buffer. - */ - ret = enable_tracer_reader_work("cp_reader", 0, tracer, - (void *)&cp_reader_work); + if (g_enable_oncpu) { + ebpf_info(LOG_CP_TAG "=== oncpu profiler enabled ===\n"); + tracer->enable_sample = true; + set_bpf_run_enabled(tracer, &oncpu_ctx, 0); - if (ret) { - goto error; + /* + * create reader for read eBPF-profiler data. + * To implement eBPF perf-profiler double buffering output, + * it is necessary to create two readers to correspond to + * the double buffering structure design. + */ + struct bpf_perf_reader *reader_a, *reader_b; + reader_a = create_perf_buffer_reader(tracer, + MAP_PERF_PROFILER_BUF_A_NAME, + reader_raw_cb, + reader_lost_cb_a, + PROFILE_PG_CNT_DEF, 1, + PROFILER_READER_EPOLL_TIMEOUT); + if (reader_a == NULL) + return ETR_NORESOURCE; + + reader_b = create_perf_buffer_reader(tracer, + MAP_PERF_PROFILER_BUF_B_NAME, + reader_raw_cb, + reader_lost_cb_b, + PROFILE_PG_CNT_DEF, 1, + PROFILER_READER_EPOLL_TIMEOUT); + if (reader_b == NULL) { + free_perf_buffer_reader(reader_a); + return ETR_NORESOURCE; + } + + oncpu_ctx.r_a = reader_a; + oncpu_ctx.r_b = reader_b; + + /* + * Start a new thread to execute the data + * reading of perf buffer. + */ + ret = + enable_tracer_reader_work("oncpu_reader", + THREAD_PROFILER_READER_IDX, + tracer, + (void *)&oncpu_reader_work); + + if (ret) { + goto error; + } + } else { + tracer->enable_sample = false; + ebpf_info(LOG_CP_TAG "=== oncpu profiler disabled ===\n"); } + if (tracer_probes_init(tracer)) + return (-1); + + extended_reader_create(tracer); + + /* attach perf event */ + tracer_hooks_attach(tracer); + return ETR_OK; error: @@ -1176,9 +369,28 @@ static int create_profiler(struct bpf_tracer *tracer) return ETR_INVAL; } +static inline bool all_perf_workers_exit(struct bpf_tracer *t) +{ + int i; + int count = ARRAY_SIZE(t->perf_workers); + for (i = 0; i < count; i++) { + if (t->perf_workers[i]) + return false; + } + + return true; +} + int stop_continuous_profiler(void) { - profiler_stop = 1; + oncpu_ctx.profiler_stop = 1; + if (profiler_tracer == NULL) + return (0); + + // Wait for all reader threads to exit. 
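+	// Each reader thread clears its slot in t->perf_workers[] before it
+	// calls pthread_exit() (see oncpu_reader_work()); this loop returns
+	// only after every slot has been cleared.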
+ while (!all_perf_workers_exit(profiler_tracer)) + sleep(1); + if (flame_graph_end_time == NULL) { flame_graph_end_time = gen_file_name_by_datetime(); } @@ -1188,9 +400,9 @@ int stop_continuous_profiler(void) u64 alloc_b, free_b; get_mem_stat(&alloc_b, &free_b); - if (regex_existed) { - regfree(&profiler_regex); - regex_existed = false; + if (oncpu_ctx.regex_existed) { + regfree(&oncpu_ctx.profiler_regex); + oncpu_ctx.regex_existed = false; } ebpf_info(LOG_CP_TAG "== alloc_b %lu bytes, free_b %lu bytes, " @@ -1198,37 +410,38 @@ int stop_continuous_profiler(void) return (0); } -static void print_cp_tracer_status(struct bpf_tracer *t) +static void print_cp_tracer_status(struct bpf_tracer *t, + struct profiler_context *ctx) { u64 alloc_b, free_b; get_mem_stat(&alloc_b, &free_b); u64 sample_drop_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, SAMPLE_CNT_DROP, + if (!bpf_table_get_value(t, ctx->state_map_name, SAMPLE_CNT_DROP, (void *)&sample_drop_cnt)) { ebpf_warning("Get map '%s' sample_drop_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 output_err_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, ERROR_IDX, + if (!bpf_table_get_value(t, ctx->state_map_name, ERROR_IDX, (void *)&output_err_cnt)) { ebpf_warning("Get map '%s' output_err_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 output_count = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, OUTPUT_CNT_IDX, + if (!bpf_table_get_value(t, ctx->state_map_name, OUTPUT_CNT_IDX, (void *)&output_count)) { ebpf_warning("Get map '%s' output_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 iter_max_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, SAMPLE_ITER_CNT_MAX, + if (!bpf_table_get_value(t, ctx->state_map_name, SAMPLE_ITER_CNT_MAX, (void *)&iter_max_cnt)) { ebpf_warning("Get map '%s' iter_max_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } ebpf_info("\n\n----------------------------\nrecv envent:\t%lu\n" @@ -1236,7 +449,7 @@ static void print_cp_tracer_status(struct bpf_tracer *t) "perf_buf_lost_b:\t%lu process_lost_count:\t%lu " "stack_table_data_miss:\t%lu\n" "stackmap_clear_failed_count\t%lu\n" - "stack_trace_lost:\t%lu\ntransfer_count:\t%lu " + "stack_trace_err:\t%lu\ntransfer_count:\t%lu " "iter_count_avg:\t%.2lf\nalloc_b:\t%lu bytes " "free_b:\t%lu bytes use:\t%lu bytes\n" "eBPF map status:\n" @@ -1245,77 +458,21 @@ static void print_cp_tracer_status(struct bpf_tracer *t) " - output_err_cnt:\t%lu\n" " - iter_max_cnt:\t%lu\n" "----------------------------\n\n", - atomic64_read(&t->recv), process_count, - atomic64_read(&t->lost), perf_buf_lost_a_count, - perf_buf_lost_b_count, perf_buf_lost_a_count, - perf_buf_lost_b_count, get_process_lost_count(), + atomic64_read(&t->recv), ctx->process_count, + atomic64_read(&t->lost), ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, get_process_lost_count(ctx), get_stack_table_data_miss_count(), - stackmap_clear_failed_count, stack_trace_lost, transfer_count, - ((double)atomic64_read(&t->recv) / (double)transfer_count), - alloc_b, free_b, alloc_b - free_b, output_count, - sample_drop_cnt, output_err_cnt, iter_max_cnt); -} - -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h) -{ - u64 alloc_b, free_b; - get_mem_stat(&alloc_b, &free_b); - ebpf_debug("\n\n----------------------------\nrecv envent:\t%lu\n" - 
"kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n" - "stack_trace_lost:\t%lu\n" - "stackmap_clear_failed_count\t%lu\n" - "ransfer_count:\t%lu iter_count:\t%lu\nall" - "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n" - "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n", - atomic64_read(&t->recv), atomic64_read(&t->lost), - perf_buf_lost_a_count, perf_buf_lost_b_count, - stack_trace_lost, stackmap_clear_failed_count, - transfer_count, iter_count, - alloc_b, free_b, alloc_b - free_b, - h->hit_hash_count, msg_h->hit_hash_count); -} - -/* - * View kernel addresses exposed via /proc and other interfaces - * when /proc/sys/kernel/kptr_restrict has the value 1, it is - * necessary to set the CAP_SYSLOG capability, otherwise all k- - * ernel addresses are set to 0. - * - * This function is used to check if the kernel address is 0. - */ -static bool check_kallsyms_addr_is_zero(void) -{ - const int check_num = 100; - const int max_line_len = 256; - const char *check_str = "0000000000000000"; - - FILE *file = fopen("/proc/kallsyms", "r"); - if (file == NULL) { - ebpf_warning(LOG_CP_TAG "Error opening /proc/kallsyms"); - return false; - } - - char line[max_line_len]; - int count = 0; - - while (fgets(line, sizeof(line), file) != NULL && count < check_num) { - char address[17]; // 16 characters + null terminator - sscanf(line, "%16s", address); - - if (strcmp(address, check_str) == 0) { - count++; - } - } - - fclose(file); - - return (count == check_num); + ctx->stackmap_clear_failed_count, ctx->stack_trace_err, + ctx->transfer_count, + ((double)atomic64_read(&t->recv) / + (double)ctx->transfer_count), alloc_b, free_b, + alloc_b - free_b, output_count, sample_drop_cnt, + output_err_cnt, iter_max_cnt); } static int cpdbg_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t * outsize) + void **out, size_t *outsize) { return 0; } @@ -1369,6 +526,7 @@ static struct tracer_sockopts cpdbg_sockopts = { * @callback Profile data processing callback interface * @returns 0 on success, < 0 on error */ + int start_continuous_profiler(int freq, int java_syms_space_limit, int java_syms_update_delay, tracer_callback_t callback) @@ -1377,26 +535,13 @@ int start_continuous_profiler(int freq, int java_syms_space_limit, void *bpf_bin_buffer; uword buffer_sz; - // REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). 
- if (check_kernel_version(4, 9) != 0) { - ebpf_warning - (LOG_CP_TAG - "Currnet linux %d.%d, not support, require Linux 4.9+\n", - major, minor); - + if (!run_conditions_check()) return (-1); - } - if (check_kallsyms_addr_is_zero()) { - ebpf_warning(LOG_CP_TAG - "All kernel addresses in /proc/kallsyms are 0, Please" - " follow the steps below to resolve:\n" - "1 Make sure the content of the '/proc/sys/kernel/kpt" - "r_restrict' file is not 2, if it is 2 please set it " - "to 1.\n2 Add 'CAP_SYSLOG' permission to the containe" - "r.\n3 Restart the pod."); - return (-1); - } + profiler_context_init(&oncpu_ctx, LOG_CP_TAG, + PROFILER_TYPE_ONCPU, g_enable_oncpu, + MAP_PROFILER_STATE_NAME, MAP_STACK_A_NAME, + MAP_STACK_B_NAME, false, false); int java_space_bytes = java_syms_space_limit * 1024 * 1024; if ((java_space_bytes < JAVA_POD_WRITE_FILES_SPACE_MIN) || @@ -1413,86 +558,35 @@ int start_continuous_profiler(int freq, int java_syms_space_limit, set_java_syms_fetch_delay(java_syms_update_delay); ebpf_info("set java_syms_update_delay : %lu\n", java_syms_update_delay); - atomic64_init(&process_lost_count); - /* * Initialize cpdbg */ pthread_mutex_init(&cpdbg_mutex, NULL); - profiler_stop = 0; - start_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_SEC); - // CPUID will not be included in the aggregation of stack trace data. set_profiler_cpu_aggregation(0); - // Java agent so library generation. - if (access(AGENT_LIB_SRC_PATH, F_OK) == 0) { - if (unlink(AGENT_LIB_SRC_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - AGENT_LIB_SRC_PATH); - return (-1); - } - } - - if (access(AGENT_MUSL_LIB_SRC_PATH, F_OK) == 0) { - if (unlink(AGENT_MUSL_LIB_SRC_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - AGENT_MUSL_LIB_SRC_PATH); - return (-1); - } - } - - if (gen_file_from_mem((const char *)java_agent_so_gnu, - sizeof(java_agent_so_gnu), - (const char *)AGENT_LIB_SRC_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java agent so library(%s) generate failed.\n", - AGENT_LIB_SRC_PATH); - return (-1); - } - - if (gen_file_from_mem((const char *)java_agent_so_musl, - sizeof(java_agent_so_musl), - (const char *)AGENT_MUSL_LIB_SRC_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java agent so library(%s) generate failed.\n", - AGENT_MUSL_LIB_SRC_PATH); - return (-1); - } - - /* For java attach tool */ - if (access(JAVA_ATTACH_TOOL_PATH, F_OK) == 0) { - if (unlink(JAVA_ATTACH_TOOL_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - JAVA_ATTACH_TOOL_PATH); - return (-1); - } - } - - if (gen_file_from_mem((const char *)deepflow_jattach_bin, - sizeof(deepflow_jattach_bin), - (const char *)JAVA_ATTACH_TOOL_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java attach tool (%s) generate failed.\n", - JAVA_ATTACH_TOOL_PATH); - return (-1); - } - - if (chmod(JAVA_ATTACH_TOOL_PATH, 0755) < 0) { - ebpf_warning(LOG_CP_TAG - "file '%s' chmod failed.\n", - JAVA_ATTACH_TOOL_PATH); + // Java agent so library generation and tools install. 
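+	// Expected to write df_java_agent.so, df_java_agent_musl.so and the
+	// Java attach tool to /tmp, returning a non-zero value on failure.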
+ if (java_libs_and_tools_install() != 0) return (-1); - } snprintf(bpf_load_buffer_name, NAME_LEN, "continuous_profiler"); bpf_bin_buffer = (void *)perf_profiler_common_ebpf_data; buffer_sz = sizeof(perf_profiler_common_ebpf_data); + struct tracer_probes_conf *tps = + malloc(sizeof(struct tracer_probes_conf)); + if (tps == NULL) { + ebpf_warning("malloc() error.\n"); + return -ENOMEM; + } + memset(tps, 0, sizeof(*tps)); + init_list_head(&tps->uprobe_syms_head); + CP_PROFILE_SET_PROBES(tps); + struct bpf_tracer *tracer = setup_bpf_tracer(CP_TRACER_NAME, bpf_load_buffer_name, - bpf_bin_buffer, buffer_sz, NULL, 0, + bpf_bin_buffer, buffer_sz, tps, 0, release_profiler, create_profiler, (void *)callback, freq); if (tracer == NULL) @@ -1554,7 +648,7 @@ void release_flame_graph_hash(void) "<<< stack_count %lu add_count %lu hit_count %lu msg_ptr_zero" "_count %lu push_count %lu >>>\n", stack_count, test_add_count, test_hit_count, msg_ptr_zero_count, - push_count); + oncpu_ctx.push_count); ebpf_info(LOG_CP_TAG "Please use the following command to generate a flame graph:" @@ -1578,42 +672,21 @@ int set_profiler_regex(const char *pattern) return (-1); } + if (!g_enable_oncpu) { + ebpf_warning(LOG_CP_TAG + "'profiler_regex' cannot be set while on-CPU is currently disabled.\n"); + return (-1); + } + /* * During the data processing, the thread responsible for matching reads the * regular expression, while the thread handling the regular expression upd- * ates is different. Synchronization is implemented to ensure protection and * coordination between these two threads. */ - tracer_reader_lock(profiler_tracer); - if (*pattern == '\0') { - regex_existed = false; - ebpf_warning(LOG_CP_TAG - "Set 'profiler_regex' pattern : '', an empty" - " regular expression will not generate any stack data." 
- "Please configure the regular expression for profiler.\n"); - tracer_reader_unlock(profiler_tracer); - return (0); - } - - if (regex_existed) { - regfree(&profiler_regex); - } - - int ret = regcomp(&profiler_regex, pattern, REG_EXTENDED); - if (ret != 0) { - char error_buffer[100]; - regerror(ret, &profiler_regex, error_buffer, - sizeof(error_buffer)); - ebpf_warning(LOG_CP_TAG - "Pattern %s failed to compile the regular " - "expression: %s\n", pattern, error_buffer); - regex_existed = false; - tracer_reader_unlock(profiler_tracer); - return (-1); - } - - regex_existed = true; - tracer_reader_unlock(profiler_tracer); + profile_regex_lock(&oncpu_ctx); + do_profiler_regex_config(pattern, &oncpu_ctx); + profile_regex_unlock(&oncpu_ctx); ebpf_info(LOG_CP_TAG "Set 'profiler_regex' successful, pattern : '%s'", pattern); return (0); @@ -1627,7 +700,7 @@ int set_profiler_cpu_aggregation(int flag) return (-1); } - cpu_aggregation_flag = (u64) flag; + oncpu_ctx.cpu_aggregation_flag = (u64) flag; ebpf_info(LOG_CP_TAG "Set 'cpu_aggregation_flag' successful, value %d\n", flag); @@ -1676,6 +749,20 @@ int cpdbg_set_config(int timeout, debug_callback_t cb) return 0; } +int enable_oncpu_profiler(void) +{ + g_enable_oncpu = true; + ebpf_info(LOG_CP_TAG "Set oncpu profiler enable.\n"); + return 0; +} + +int disable_oncpu_profiler(void) +{ + g_enable_oncpu = false; + ebpf_info(LOG_CP_TAG "Set oncpu profiler distable.\n"); + return 0; +} + #else /* defined AARCH64_MUSL */ #include "../tracer.h" #include "perf_profiler.h" @@ -1718,7 +805,8 @@ struct bpf_tracer *get_profiler_tracer(void) return NULL; } -void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag) +void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx, + u64 enable_flag) { } @@ -1726,4 +814,14 @@ int cpdbg_set_config(int timeout, debug_callback_t cb) { } +int enable_oncpu_profiler(void) +{ + return 0; +} + +int disable_oncpu_profiler(void) +{ + return 0; +} + #endif /* AARCH64_MUSL */ diff --git a/agent/src/ebpf/user/profile/perf_profiler.h b/agent/src/ebpf/user/profile/perf_profiler.h index 76551a57ec2..75176329ad1 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.h +++ b/agent/src/ebpf/user/profile/perf_profiler.h @@ -16,6 +16,8 @@ #ifndef DF_USER_PERF_PROFILER_H #define DF_USER_PERF_PROFILER_H +#define CP_PROFILE_SET_PROBES(T) +#include "extended/extended.h" #include "../bihash_24_8.h" #include "../../kernel/include/perf_profiler.h" @@ -49,22 +51,20 @@ typedef struct { union { struct { /* - * tgid:(max 67,108,864) - * The tgid (Thread Group ID) in kernel space - * is equivalent to the process ID in user space. - * pid:(max 67,108,864) - * The process ID or thread ID in kernel space. - * cpu: (max 4,096) - * Which CPU core does the perf event occur on? - */ - u64 tgid: 26, - pid: 26, - cpu: 12; + * tgid:(max 67,108,864) + * The tgid (Thread Group ID) in kernel space + * is equivalent to the process ID in user space. + * pid:(max 67,108,864) + * The process ID or thread ID in kernel space. + * cpu: (max 4,096) + * Which CPU core does the perf event occur on? + */ + u64 tgid:26, pid:26, cpu:12; /* * process start time(the number of millisecond * elapsed since January 1, 1970 00:00:00). - */ + */ u64 stime; u32 u_stack_id; u32 k_stack_id; @@ -73,19 +73,25 @@ typedef struct { /* Matching and combining for process/thread name. 
*/ struct { u8 comm[TASK_COMM_LEN]; - u64 pid: 26, - reserved: 26, - cpu: 12; + u64 pid:26, reserved:26, cpu:12; } c_k; }; /* Store perf profiler data */ uword msg_ptr; -} stack_trace_msg_kv_t; +} stack_trace_msg_kv_t; + +enum { + PROFILER_TYPE_UNKNOWN, + PROFILER_TYPE_ONCPU, + PROFILER_TYPE_NUM, +}; /* * stack trace message value, push data * + * @profiler_type + * Profiler type, such as PROFILER_TYPE_ONCPU. * @time_stamp * Timestamp of the stack trace data(unit: nanoseconds). * @pid @@ -106,6 +112,10 @@ typedef struct { * The profiler captures the number of occurrences of the same * data by querying with the quadruple * "" as the key. + * 1.1936h + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in Microseconds) are + * used. Range: [1, 2^32-1)us * @comm * comm in task_struct(linux kernel), always 16 bytes * If the capture is a process, fill in the process name here. @@ -125,6 +135,7 @@ typedef struct { * + ";" + */ typedef struct { + u8 profiler_type; u64 time_stamp; u32 pid; u32 tid; @@ -151,10 +162,11 @@ int stop_continuous_profiler(void); int start_continuous_profiler(int freq, int java_syms_space_limit, int java_syms_update_delay, tracer_callback_t callback); -void process_stack_trace_data_for_flame_graph(stack_trace_msg_t *val); +void process_stack_trace_data_for_flame_graph(stack_trace_msg_t * val); void release_flame_graph_hash(void); int set_profiler_regex(const char *pattern); int set_profiler_cpu_aggregation(int flag); struct bpf_tracer *get_profiler_tracer(void); void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag); +void cpdbg_process(stack_trace_msg_t * msg); #endif /* DF_USER_PERF_PROFILER_H */ diff --git a/agent/src/ebpf/user/profile/profile_common.c b/agent/src/ebpf/user/profile/profile_common.c new file mode 100644 index 00000000000..08f502d43e4 --- /dev/null +++ b/agent/src/ebpf/user/profile/profile_common.c @@ -0,0 +1,1067 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * View kernel addresses exposed via /proc and other interfaces + * when /proc/sys/kernel/kptr_restrict has the value 1, it is + * necessary to set the CAP_SYSLOG capability, otherwise all k- + * ernel addresses are set to 0. + * + * This function is used to check if the kernel address is 0. 
+ */
+
+#include <sys/stat.h>
+#include <bcc/perf_reader.h>
+#include "../config.h"
+#include "../utils.h"
+#include "../common.h"
+#include "../mem.h"
+#include "../log.h"
+#include "../types.h"
+#include "../vec.h"
+#include "../tracer.h"
+#include "../socket.h"
+#include "java/gen_syms_file.h"
+#include "perf_profiler.h"
+#include "../elf.h"
+#include "../load.h"
+#include "../../kernel/include/perf_profiler.h"
+#include "../perf_reader.h"
+#include "../bihash_8_8.h"
+#include "stringifier.h"
+#include "../table.h"
+#include <regex.h>
+#include "java/config.h"
+#include "java/df_jattach.h"
+#include "profile_common.h"
+
+/*
+ * This section is for symbolization of Java addresses, and we need
+ * to prepare two .so libraries, one for GNU and the other for MUSL:
+ *
+ * df_java_agent.so
+ * df_java_agent_musl.so
+ *
+ * These two files need to be saved in the '/tmp' directory, and the
+ * agent so library will be injected into the JVM to generate
+ * 'perf-.map'.
+ */
+#include "java_agent_so_gnu.c"
+#include "java_agent_so_musl.c"
+/* used for Java symbol generation */
+#include "deepflow_jattach_bin.c"
+
+extern struct bpf_tracer *profiler_tracer;
+extern char *flame_graph_start_time;
+extern int major, minor;
+
+static bool java_installed;
+
+int profiler_context_init(struct profiler_context *ctx,
+ const char *tag, u8 type,
+ bool enable_profiler,
+ const char *state_map_name,
+ const char *stack_map_name_a,
+ const char *stack_map_name_b,
+ bool only_matched, bool use_delta_time)
+{
+ memset(ctx, 0, sizeof(struct profiler_context));
+ ctx->tag = tag;
+ atomic64_init(&ctx->process_lost_count);
+ if (!enable_profiler)
+ ctx->profiler_stop = 1;
+
+ snprintf(ctx->state_map_name, sizeof(ctx->state_map_name), "%s",
+ state_map_name);
+ snprintf(ctx->stack_map_name_a, sizeof(ctx->stack_map_name_a), "%s",
+ stack_map_name_a);
+ snprintf(ctx->stack_map_name_b, sizeof(ctx->stack_map_name_b), "%s",
+ stack_map_name_b);
+ ctx->regex_existed = false;
+ ctx->only_matched_data = only_matched;
+ ctx->use_delta_time = use_delta_time;
+ ctx->type = type;
+
+ return 0;
+}
+
+void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx,
+ u64 enable_flag)
+{
+ if (ctx->profiler_stop == 1)
+ return;
+
+ if (bpf_table_set_value(t, ctx->state_map_name,
+ ENABLE_IDX, &enable_flag) == false) {
+ ebpf_warning("%sprofiler state map update error."
+ "(%s enable_flag %lu) - %s\n",
+ ctx->tag, ctx->state_map_name, enable_flag,
+ strerror(errno));
+ return;
+ }
+
+ ctx->enable_bpf_profile = enable_flag;
+
+ ebpf_info("%s%s() success, enable_flag:%lu\n", ctx->tag, __func__,
+ enable_flag);
+}
+
+int do_profiler_regex_config(const char *pattern, struct profiler_context *ctx)
+{
+ if (*pattern == '\0') {
+ ctx->regex_existed = false;
+ ebpf_warning("%sSet 'profiler_regex' pattern : '', an empty"
+ " regular expression will not generate any stack data."
+ "Please configure the regular expression for profiler.\n", + ctx->tag); + return (0); + } + + if (ctx->regex_existed) { + regfree(&ctx->profiler_regex); + } + + int ret = regcomp(&ctx->profiler_regex, pattern, REG_EXTENDED); + if (ret != 0) { + char error_buffer[100]; + regerror(ret, &ctx->profiler_regex, error_buffer, + sizeof(error_buffer)); + ebpf_warning("%sPattern %s failed to compile the regular " + "expression: %s\n", ctx->tag, pattern, + error_buffer); + ctx->regex_existed = false; + return (-1); + } + + ctx->regex_existed = true; + return 0; +} + +static bool check_kallsyms_addr_is_zero(void) +{ + const int check_num = 100; + const int max_line_len = 256; + const char *check_str = "0000000000000000"; + + FILE *file = fopen("/proc/kallsyms", "r"); + if (file == NULL) { + ebpf_warning("Error opening /proc/kallsyms"); + return false; + } + + char line[max_line_len]; + int count = 0; + + while (fgets(line, sizeof(line), file) != NULL && count < check_num) { + char address[17]; // 16 characters + null terminator + sscanf(line, "%16s", address); + + if (strcmp(address, check_str) == 0) { + count++; + } + } + + fclose(file); + + return (count == check_num); +} + +bool run_conditions_check(void) +{ + // REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). + if (check_kernel_version(4, 9) != 0) { + ebpf_warning + ("Currnet linux %d.%d, not support, require Linux 4.9+\n", + major, minor); + + return false; + } + + if (check_kallsyms_addr_is_zero()) { + ebpf_warning + ("All kernel addresses in /proc/kallsyms are 0, Please" + " follow the steps below to resolve:\n" + "1 Make sure the content of the '/proc/sys/kernel/kpt" + "r_restrict' file is not 2, if it is 2 please set it " + "to 1.\n2 Add 'CAP_SYSLOG' permission to the containe" + "r.\n3 Restart the pod."); + return false; + } + + return true; +} + +int java_libs_and_tools_install(void) +{ + if (java_installed) + return (0); + + // Java agent so library generation. 
+ if (access(AGENT_LIB_SRC_PATH, F_OK) == 0) { + if (unlink(AGENT_LIB_SRC_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + AGENT_LIB_SRC_PATH); + return (-1); + } + } + + if (access(AGENT_MUSL_LIB_SRC_PATH, F_OK) == 0) { + if (unlink(AGENT_MUSL_LIB_SRC_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + AGENT_MUSL_LIB_SRC_PATH); + return (-1); + } + } + + if (gen_file_from_mem((const char *)java_agent_so_gnu, + sizeof(java_agent_so_gnu), + (const char *)AGENT_LIB_SRC_PATH)) { + ebpf_warning("Java agent so library(%s) generate failed.\n", + AGENT_LIB_SRC_PATH); + return (-1); + } + + if (gen_file_from_mem((const char *)java_agent_so_musl, + sizeof(java_agent_so_musl), + (const char *)AGENT_MUSL_LIB_SRC_PATH)) { + ebpf_warning("Java agent so library(%s) generate failed.\n", + AGENT_MUSL_LIB_SRC_PATH); + return (-1); + } + + /* For java attach tool */ + if (access(JAVA_ATTACH_TOOL_PATH, F_OK) == 0) { + if (unlink(JAVA_ATTACH_TOOL_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + } + + if (gen_file_from_mem((const char *)deepflow_jattach_bin, + sizeof(deepflow_jattach_bin), + (const char *)JAVA_ATTACH_TOOL_PATH)) { + ebpf_warning("Java attach tool (%s) generate failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + + if (chmod(JAVA_ATTACH_TOOL_PATH, 0755) < 0) { + ebpf_warning("file '%s' chmod failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + + java_installed = true; + + return (0); +} + +static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, + const char *stack_map_name) +{ + struct ebpf_map *map = + ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); + if (map == NULL) { + ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, + stack_map_name); + return 0; + } + int map_fd = map->fd; + + u32 key = 0, next_key; + u32 reclaim_count = 0; + u32 find_count = 0; + struct list_head clear_elem_head; + init_list_head(&clear_elem_head); + + while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { + find_count++; + insert_list(&next_key, sizeof(next_key), &clear_elem_head); + key = next_key; + } + + reclaim_count = __reclaim_map(map_fd, &clear_elem_head); + + ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", + __func__, stack_map_name, find_count, reclaim_count); + + return reclaim_count; +} + +static void cleanup_stackmap(struct profiler_context *ctx, struct bpf_tracer *t, + const char *stack_map_name, bool is_a) +{ + struct stack_ids_bitmap *ids; + int *clear_stack_ids; + u64 *perf_buf_lost_p = NULL; + + if (is_a) { + ids = &ctx->stack_ids_a; + clear_stack_ids = ctx->clear_stack_ids_a; + perf_buf_lost_p = &ctx->perf_buf_lost_a_count; + } else { + ids = &ctx->stack_ids_b; + clear_stack_ids = ctx->clear_stack_ids_b; + perf_buf_lost_p = &ctx->perf_buf_lost_b_count; + } + + if (ids->count != vec_len(clear_stack_ids)) { + ebpf_warning + ("%sstack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", + ctx->tag, ids->count, vec_len(clear_stack_ids)); + } + + /* + * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, + * which may lead to out-of-order behavior in a multi-core environment. + * We have employed a threshold to delay the cleanup of the stack map, reducing the + * occurrence of premature clearing of stack entries caused by the disorder in stack + * data. + * + * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in + * 'agent/src/ebpf/user/config.h'. 
+ */
+ if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) {
+ int *sid;
+ vec_foreach(sid, clear_stack_ids) {
+ int id = *sid;
+ if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) {
+ /*
+ * It may be due to the disorder in the perf buffer transmission,
+ * leading to the repetitive deletion of the same stack ID.
+ */
+ ctx->stackmap_clear_failed_count++;
+ }
+
+ clear_bitmap(ids->bitmap, id);
+ }
+
+ if (is_a)
+ vec_free(ctx->clear_stack_ids_a);
+ else
+ vec_free(ctx->clear_stack_ids_b);
+
+ ids->count = 0;
+
+ /*
+ * If data loss occurs due to the user-space receiver program
+ * being too busy and not promptly fetching data from the perf
+ * buffer, it is necessary to clean the stack map once to prevent
+ * excessive remnants of stack data from affecting the acquisition
+ * of new stack data (i.e., eBPF using the bpf_get_stackid()
+ * interface will return -EEXIST).
+ */
+ if (*perf_buf_lost_p > 0) {
+ delete_all_stackmap_elems(t, stack_map_name);
+ *perf_buf_lost_p = 0;
+ }
+ }
+}
+
+static void print_profiler_status(struct profiler_context *ctx,
+ struct bpf_tracer *t, u64 iter_count)
+{
+ u64 alloc_b, free_b;
+ get_mem_stat(&alloc_b, &free_b);
+ ebpf_debug("\n\n----------------------------\n%srecv event:\t%lu\n"
+ "kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n"
+ "stack_trace_err:\t%lu\n"
+ "stackmap_clear_failed_count\t%lu\n"
+ "transfer_count:\t%lu iter_count:\t%lu\nall"
+ "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n"
+ "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n",
+ ctx->tag, atomic64_read(&t->recv), atomic64_read(&t->lost),
+ ctx->perf_buf_lost_a_count, ctx->perf_buf_lost_b_count,
+ ctx->stack_trace_err, ctx->stackmap_clear_failed_count,
+ ctx->transfer_count, iter_count,
+ alloc_b, free_b, alloc_b - free_b,
+ ctx->stack_str_hash.hit_hash_count,
+ ctx->msg_hash.hit_hash_count);
+}
+
+static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *arg)
+{
+ struct profiler_context *ctx = arg;
+ stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv;
+ if (msg_kv->msg_ptr != 0) {
+ stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr;
+
+ /* continuous profiler debug */
+ cpdbg_process(msg);
+
+ tracer_callback_t fun = profiler_tracer->process_fn;
+ /*
+ * Execute callback function to hand over the data to the
+ * higher level for processing. The higher level will se-
+ * nd the data to the server for storage as required.
+ */
+ if (likely(ctx->profiler_stop == 0))
+ fun(msg);
+
+ clib_mem_free((void *)msg);
+ msg_kv->msg_ptr = 0;
+ }
+
+ int ret = VEC_OK;
+ vec_add1(ctx->trace_msg_kvps, *kv, ret);
+ if (ret != VEC_OK) {
+ ebpf_warning("vec add failed\n");
+ ctx->msg_clear_hash = true;
+ }
+
+ return BIHASH_WALK_CONTINUE;
+}
+
+/*
+ * Push the data and release the resources.
+ * @is_force: Whether to force the push and release.
+ */
+void push_and_release_stack_trace_msg(struct profiler_context *ctx,
+ stack_trace_msg_hash_t * h, bool is_force)
+{
+ ASSERT(profiler_tracer != NULL);
+
+ u64 curr_time, elapsed;
+ curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN);
+ elapsed = curr_time - ctx->last_push_time;
+
+ /*
+ * If the aggregated stack trace data obtained by the profiler
+ * satisfies one of the following conditions, it should be pushed
+ * to the upper-level processing:
+ *
+ * If the time interval since the last push exceeds or equals
+ * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL).
+ *
+ * Otherwise, it should return directly.
+ */ + if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) + return; + + /* update last push time. */ + ctx->last_push_time = curr_time; + ctx->push_count++; + + stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, + (void *)ctx); + /* + * In this iteration, all elements will be cleared, and in the + * next iteration, this hash will be reused. + */ + stack_trace_msg_hash_kv *v; + vec_foreach(v, ctx->trace_msg_kvps) { + if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { + ebpf_warning + ("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + ctx->msg_clear_hash = true; + } + } + + vec_free(ctx->trace_msg_kvps); + + h->hit_hash_count = 0; + h->hash_elems_count = 0; + + if (ctx->msg_clear_hash) { + ctx->msg_clear_hash = false; + stack_trace_msg_hash_free(h); + } +} + +static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, + const char *name) +{ + memset(h, 0, sizeof(*h)); + u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; + u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; + return stack_trace_msg_hash_init(h, (char *)name, + nbuckets, hash_memory_size); +} + +static void add_stack_id_to_bitmap(struct profiler_context *ctx, + int stack_id, bool is_a) +{ + if (stack_id < 0) + return; + + struct stack_ids_bitmap *ids; + if (is_a) + ids = &ctx->stack_ids_a; + else + ids = &ctx->stack_ids_b; + + if (!is_set_bitmap(ids->bitmap, stack_id)) { + set_bitmap(ids->bitmap, stack_id); + int ret = VEC_OK; + + if (is_a) + vec_add1(ctx->clear_stack_ids_a, stack_id, ret); + else + vec_add1(ctx->clear_stack_ids_b, stack_id, ret); + + if (ret != VEC_OK) { + ebpf_warning("%svec add failed\n", ctx->tag); + } + + ids->count++; + } +} + +/* + * The invocation of this interface is always when the process name does + * not match. + */ +static void set_msg_kvp_by_comm(stack_trace_msg_kv_t * kvp, + struct stack_trace_key_t *v, void *msg_value) +{ + strcpy_s_inline(kvp->c_k.comm, sizeof(kvp->c_k.comm), + v->comm, strlen(v->comm)); + kvp->c_k.cpu = v->cpu; + kvp->c_k.pid = v->tgid; + kvp->c_k.reserved = 0; + kvp->msg_ptr = pointer_to_uword(msg_value); +} + +static void set_msg_kvp(stack_trace_msg_kv_t * kvp, + struct stack_trace_key_t *v, u64 stime, void *msg_value) +{ + kvp->k.tgid = v->tgid; + kvp->k.pid = v->pid; + kvp->k.stime = stime; + kvp->k.cpu = v->cpu; + kvp->k.u_stack_id = (u32) v->userstack; + kvp->k.k_stack_id = (u32) v->kernstack; + kvp->msg_ptr = pointer_to_uword(msg_value); +} + +static void set_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_t * msg, + struct stack_trace_key_t *v, + bool matched, + u64 stime, + u64 ns_id, + const char *process_name, + const char *container_id) +{ + msg->pid = v->tgid; + msg->tid = v->pid; + msg->cpu = v->cpu; + msg->u_stack_id = (u32) v->userstack; + msg->k_stack_id = (u32) v->kernstack; + strcpy_s_inline(msg->comm, sizeof(msg->comm), v->comm, strlen(v->comm)); + msg->stime = stime; + msg->netns_id = ns_id; + msg->profiler_type = ctx->type; + if (container_id != NULL) { + strcpy_s_inline(msg->container_id, sizeof(msg->container_id), + container_id, strlen(container_id)); + } + + if (stime > 0) { + /* + * Note: There is no process with PID 0 in procfs. + * If the PID is 0, it will return the kernel's + * startup time, and the process name will be + * obtained from data retrieved through eBPF. 
+ */ + if (msg->pid == 0) { + memcpy(msg->process_name, v->comm, sizeof(msg->comm)); + } else { + if (process_name != NULL) { + strcpy_s_inline(msg->process_name, + sizeof(msg->process_name), + process_name, + strlen(process_name)); + } + } + + } else { + + /* + * If the process has already exited, then execution reaches + * this point, which means aggregating data based on the + * process name. + */ + strcpy_s_inline(msg->process_name, sizeof(msg->process_name), + v->comm, strlen(v->comm)); + atomic64_inc(&ctx->process_lost_count); + } + + if (!matched || stime <= 0) { + /* The aggregation method is identified as + * { process name + [u,k]stack_trace_id + cpu} */ + msg->stime = 0; + if (!matched) { + msg->pid = msg->tid = 0; + snprintf((char *)msg->process_name, + sizeof(msg->process_name), "%s", "Total"); + } + } + + msg->time_stamp = gettime(CLOCK_REALTIME, TIME_TYPE_NAN); + if (ctx->use_delta_time) + // Using microseconds for storage. + msg->count = v->duration_ns / 1000; + else + msg->count = 1; + msg->data_ptr = pointer_to_uword(&msg->data[0]); + + /* Only use for test flame graph. */ + if (flame_graph_start_time == NULL) { + flame_graph_start_time = gen_file_name_by_datetime(); + } +} + +static inline stack_trace_msg_t *alloc_stack_trace_msg(int len) +{ + void *trace_msg; + trace_msg = clib_mem_alloc_aligned("stack_msg", len, 0, NULL); + if (trace_msg == NULL) { + ebpf_warning("stack trace msg alloc memory failed.\n"); + } else { + stack_trace_msg_t *msg = trace_msg; + return msg; + } + + return NULL; +} + +static inline void update_matched_process_in_total(struct profiler_context *ctx, + stack_trace_msg_hash_t * + msg_hash, + char *process_name, + struct stack_trace_key_t *v) +{ + stack_trace_msg_kv_t kv; + set_msg_kvp_by_comm(&kv, v, (void *)0); + + if (stack_trace_msg_hash_search + (msg_hash, (stack_trace_msg_hash_kv *) & kv, + (stack_trace_msg_hash_kv *) & kv) == 0) { + __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); + ((stack_trace_msg_t *) kv.msg_ptr)->count++; + return; + } + + /* append ';' '\0' and '[p/t]' */ + char trace_str[(TASK_COMM_LEN * 2) + 10]; + bool is_thread = (v->pid != v->tgid); + if (is_thread) + snprintf(trace_str, sizeof(trace_str), "[p] %s;[t] %s", + process_name, v->comm); + else + snprintf(trace_str, sizeof(trace_str), "[p] %s", process_name); + + /* append 2 byte for ';''\0' */ + int len = sizeof(stack_trace_msg_t) + strlen(trace_str) + 2; + stack_trace_msg_t *msg = alloc_stack_trace_msg(len); + if (msg == NULL) { + clib_mem_free(trace_str); + return; + } + + set_stack_trace_msg(ctx, msg, v, false, 0, 0, process_name, NULL); + snprintf((char *)&msg->data[0], strlen(trace_str) + 2, "%s", trace_str); + msg->data_len = strlen((char *)msg->data); + kv.msg_ptr = pointer_to_uword(msg); + + if (stack_trace_msg_hash_add_del(msg_hash, + (stack_trace_msg_hash_kv + *) & kv, 1 /* is_add */ )) { + ebpf_warning("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + clib_mem_free(msg); + } else { + __sync_fetch_and_add(&msg_hash->hash_elems_count, 1); + } +} + +static void aggregate_stack_traces(struct profiler_context *ctx, + struct bpf_tracer *t, + const char *stack_map_name, + stack_str_hash_t * stack_str_hash, + stack_trace_msg_hash_t * msg_hash, + u32 * count, bool use_a_map) +{ + struct stack_trace_key_t *v; + vec_foreach(v, ctx->raw_stack_data) { + if (v == NULL) + break; + + if (unlikely(ctx->profiler_stop == 1)) + break; + + /* + * If cpu_aggregation_flag=0, the CPU value for stack trace data + * reporting is a special value (CPU_INVALID:0xfff) used 
to indicate + * that it is an invalid value, the CPUID will not be included in + * the aggregation. + */ + if (ctx->cpu_aggregation_flag == 0) + v->cpu = CPU_INVALID; + + /* + * Uniform idle process names to reduce the aggregated count of stack + * trace data (when we aggregate using process names as part of the key). + * "swapper/0", "swapper/1", "swapper/2" ... > "swapper" + */ + if (v->pid == v->tgid && v->pid == 0) { + const char *idle_name = "swapper"; + strcpy_s_inline(v->comm, sizeof(v->comm), + idle_name, strlen(idle_name)); + } + + /* -EEXIST: Hash bucket collision in the stack trace table */ + if (v->kernstack == -EEXIST) + ctx->stack_trace_err++; + + if (v->userstack == -EEXIST) + ctx->stack_trace_err++; + + add_stack_id_to_bitmap(ctx, v->kernstack, use_a_map); + add_stack_id_to_bitmap(ctx, v->userstack, use_a_map); + + /* Total iteration count for this iteration. */ + (*count)++; + + /* Total iteration count for all iterations. */ + ctx->process_count++; + + /* + * Firstly, search the stack-trace-msg hash to see if the + * stack trace messages has already been stored. + */ + stack_trace_msg_kv_t kv; + char name[TASK_COMM_LEN]; + memset(name, 0, sizeof(name)); + u64 stime, netns_id; + stime = netns_id = 0; + void *info_p = NULL; + struct symbolizer_proc_info *__info_p = NULL; + char *process_name = NULL; + bool matched, is_match_finish; + matched = is_match_finish = false; + + /* If it is a process, match operation will be performed immediately. */ + if (v->pid == v->tgid) { + is_match_finish = true; + profile_regex_lock(ctx); + matched = + (regexec(&ctx->profiler_regex, v->comm, 0, NULL, 0) + == 0); + profile_regex_unlock(ctx); + if (!matched) { + if (ctx->only_matched_data) { + continue; + } + + set_msg_kvp_by_comm(&kv, v, (void *)0); + goto skip_proc_find; + } + } + + get_process_info_by_pid(v->tgid, &stime, &netns_id, + (char *)name, &info_p); + __info_p = info_p; + + /* + * If the data collected is from a running process, and the process + * name and the command name of the task (captured by eBPF) are not + * consistent, it indicates that the cached process information is + * no longer valid. + */ + if (stime > 0 && v->pid == v->tgid && strcmp(name, v->comm)) { + stime = netns_id = 0; + name[0] = '\0'; + process_name = NULL; + info_p = NULL; + } + + if (stime > 0) { + if (v->tgid == 0) + process_name = v->comm; + else + process_name = name; + + if (!is_match_finish) { + profile_regex_lock(ctx); + matched = + (regexec + (&ctx->profiler_regex, process_name, 0, + NULL, 0) + == 0); + profile_regex_unlock(ctx); + } + + if (matched) + set_msg_kvp(&kv, v, stime, (void *)0); + else { + if (ctx->only_matched_data) { + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + continue; + } + + set_msg_kvp_by_comm(&kv, v, (void *)0); + } + } else { + if (ctx->only_matched_data) { + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + continue; + } + + /* Not find process in procfs. */ + set_msg_kvp_by_comm(&kv, v, (void *)0); + } + + /* + * Here, we duplicate the matched process data and place it into + * the Total process, with the aim of showcasing the proportion + * of each process in the overall sampling. 
+ */ + if (matched && !ctx->only_matched_data) + update_matched_process_in_total(ctx, msg_hash, + process_name, v); + + skip_proc_find: + if (stack_trace_msg_hash_search + (msg_hash, (stack_trace_msg_hash_kv *) & kv, + (stack_trace_msg_hash_kv *) & kv) == 0) { + __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); + ((stack_trace_msg_t *) kv.msg_ptr)->count += + v->duration_ns; + if (__info_p && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + continue; + } + + /* + * Folded stack trace string and generate stack trace messages. + * + * Folded stack trace string (taken from a performance profiler test): + * main;xxx();yyy() + * It is a list of symbols corresponding to addresses in the underlying + * stack trace, separated by ';'. + */ + + char *trace_str = + resolve_and_gen_stack_trace_str(t, v, stack_map_name, + stack_str_hash, matched, + process_name, info_p); + + if (trace_str) { + /* + * append process/thread name to stack string + * append 2 byte for ';''\0' + * append pre_tag '[p/t]' + */ + char pre_tag[5]; + int str_len = strlen(trace_str) + 2; + if (matched) + str_len += strlen(v->comm) + sizeof(pre_tag); + + int len = sizeof(stack_trace_msg_t) + str_len; + stack_trace_msg_t *msg = alloc_stack_trace_msg(len); + if (msg == NULL) { + clib_mem_free(trace_str); + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + continue; + } + + memset(msg, 0, len); + struct symbolizer_proc_info *__p = info_p; + set_stack_trace_msg(ctx, msg, v, matched, stime, + netns_id, process_name, + __p ? __p->container_id : NULL); + + snprintf(pre_tag, sizeof(pre_tag), "%s ", + v->pid == v->tgid ? "[p]" : "[t]"); + if (matched) + snprintf((char *)&msg->data[0], str_len, + "%s%s;%s", pre_tag, v->comm, + trace_str); + else + snprintf((char *)&msg->data[0], str_len, "%s", + trace_str); + + msg->data_len = strlen((char *)msg->data); + clib_mem_free(trace_str); + kv.msg_ptr = pointer_to_uword(msg); + + if (stack_trace_msg_hash_add_del(msg_hash, + (stack_trace_msg_hash_kv + *) & kv, + 1 /* is_add */ )) { + ebpf_warning + ("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + clib_mem_free(msg); + } else { + __sync_fetch_and_add + (&msg_hash->hash_elems_count, 1); + } + } + + if (__info_p && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + /* check and clean symbol cache */ + exec_proc_info_cache_update(); + } + + vec_free(ctx->raw_stack_data); +} + +void process_bpf_stacktraces(struct profiler_context *ctx, struct bpf_tracer *t) +{ + struct bpf_perf_reader *r; + const char *stack_map_name; + bool using_map_set_a = (ctx->transfer_count % 2 == 0); + r = using_map_set_a ? ctx->r_a : ctx->r_b; + stack_map_name = + using_map_set_a ? ctx->stack_map_name_a : ctx->stack_map_name_b; + const u64 sample_count_idx = + using_map_set_a ? SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; + + struct epoll_event events[r->readers_count]; + int nfds = reader_epoll_wait(r, events, 0); + + ctx->transfer_count++; + if (bpf_table_set_value(t, ctx->state_map_name, + TRANSFER_CNT_IDX, + &ctx->transfer_count) == false) { + ebpf_warning("%sprofiler state map update error." + "(%s transfer_count %lu) - %s\n", + ctx->tag, ctx->state_map_name, ctx->transfer_count, + strerror(errno)); + ctx->transfer_count--; + } + + /* Total iteration count for this iteration. */ + u32 count = 0; + + /* eBPF map record count for this iteration. */ + u64 sample_cnt_val = 0; + + /* + * Why use g_stack_str_hash? 
+ * + * When the stringizer encounters a stack-ID for the first time in + * the stack trace table, it clears it. If a stack-ID is reused by + * different stack trace keys, the stringizer returns its memoized + * stack trace string. Since stack IDs are unstable between profile + * iterations, we create and destroy the stringizer in each profile + * iteration. + */ + if (unlikely(ctx->stack_str_hash.buckets == NULL)) { + if (init_stack_str_hash + (&ctx->stack_str_hash, "profile_stack_str")) { + ebpf_warning("%sinit_stack_str_hash() failed.\n", + ctx->tag); + return; + } + } + + /* + * During each transmission iteration, we have a hashmap structure in + * place for the following purposes: + * + * 1 Pushing the data of this iteration to the higher-level processing. + * 2 Performing data statistics based on the stack trace data, using the + * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + + * u_stack_id + " as the key. + * + * Here is the key-value pair structure of the hashmap: + * see perf_profiler.h (stack_trace_msg_kv_t) + * This is the final form of the data. If the current stack trace message + * is a match, we only need to increment the count field in the correspon- + * ding value, thus avoiding duplicate parsing. + */ + if (unlikely(ctx->msg_hash.buckets == NULL)) { + if (init_stack_trace_msg_hash + (&ctx->msg_hash, "stack_trace_msg")) { + ebpf_warning("%sinit_stack_trace_msg_hash() failed.\n", + ctx->tag); + return; + } + } + + if (nfds > 0) { + + check_again: + if (unlikely(ctx->profiler_stop == 1)) + goto release_iter; + + /* + * If there is data, the reader's callback + * function will be called. + */ + reader_event_read(events, nfds); + + /* + * After the reader completes data reading, the work of + * data aggregation will be blocked if there is no data. + */ + aggregate_stack_traces(ctx, t, stack_map_name, + &ctx->stack_str_hash, &ctx->msg_hash, + &count, using_map_set_a); + + /* + * To ensure that all data in the perf ring-buffer is procenssed + * in this iteration, as this iteration will clean up all the + * data recorded in the stackmap, any residual data in the perf + * ring-buffer will be carried over to the next iteration for + * processing. This poses a risk of not being able to find the + * corresponding stackmap records in the next iteration, leading + * to incomplete processing. + */ + if (bpf_table_get_value(t, ctx->state_map_name, + sample_count_idx, + (void *)&sample_cnt_val)) { + if (sample_cnt_val > count) { + nfds = reader_epoll_short_wait(r, events, 0); + if (nfds > 0) + goto check_again; + } + } + } + +release_iter: + + cleanup_stackmap(ctx, t, stack_map_name, using_map_set_a); + + /* Now that we've consumed the data, reset the sample count in BPF. */ + sample_cnt_val = 0; + bpf_table_set_value(t, ctx->state_map_name, + sample_count_idx, &sample_cnt_val); + + print_profiler_status(ctx, t, count); + + /* free all elems */ + clean_stack_strs(&ctx->stack_str_hash); + + /* Push messages and free stack_trace_msg_hash */ + push_and_release_stack_trace_msg(ctx, &ctx->msg_hash, false); +} diff --git a/agent/src/ebpf/user/profile/profile_common.h b/agent/src/ebpf/user/profile/profile_common.h new file mode 100644 index 00000000000..96586dc393d --- /dev/null +++ b/agent/src/ebpf/user/profile/profile_common.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DF_USER_PROFILE_COMMON_H +#define DF_USER_PROFILE_COMMON_H + +struct profiler_context { + // log output flag string + const char *tag; + // Profiler type + u8 type; + // The name of the status map + char state_map_name[MAP_NAME_SZ]; + // The dual-buffered reader is used to read data from the perf buffer. + struct bpf_perf_reader *r_a; + struct bpf_perf_reader *r_b; + // stack map name + char stack_map_name_a[MAP_NAME_SZ]; + char stack_map_name_b[MAP_NAME_SZ]; + + // Read raw data from the eBPF perfbuf and temporarily store it. + struct stack_trace_key_t *raw_stack_data; + + // Cache hash: obtain folded stack trace string from stack ID. + stack_str_hash_t stack_str_hash; + // Used for tracking data statistics and pushing. + stack_trace_msg_hash_t msg_hash; + + /* + * 'cpu_aggregation_flag' is used to set whether to retrieve CPUID + * and include it in the aggregation of stack trace data. + * + * If valude is set to 1, CPUID will be retrieved and included in + * the aggregation of stack trace data. If value is set to 0, + * CPUID will not be retrieved and will not be included in the + * aggregation. Any other value is considered invalid. + */ + volatile u64 cpu_aggregation_flag; + + /* + * To perform regular expression matching on process names, + * the 'profiler_regex' is set using the 'set_profiler_regex()' + * interface. Processes that successfully match the regular + * expression are aggregated using the key: + * `{pid + stime + u_stack_id + k_stack_id + tid + cpu}`. + * + * For processes that do not match, they are aggregated using the + * key: + * ``. + */ + regex_t profiler_regex; + bool regex_existed; + volatile u32 regex_lock; + + /* + * The profiler stop flag, with 1 indicating stop and + * 0 indicating running status. + */ + volatile u64 profiler_stop; + + /* + * This flag is used to enable the eBPF program to start working. + * with 1 indicating enable and 0 indicating disable. + */ + volatile u64 enable_bpf_profile; + + /* + * The identifier to only retrieve matched data. This flag + * setting will exclude the total process. + */ + bool only_matched_data; + + /* + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in nanoseconds) are + * used. This setting determines whether to use time intervals, + * with a default value of false. + */ + bool use_delta_time; + + // Record all stack IDs in each iteration for quick retrieval. + struct stack_ids_bitmap stack_ids_a; + struct stack_ids_bitmap stack_ids_b; + // This vector table is used to remove a stack from the stack map. + int *clear_stack_ids_a; + int *clear_stack_ids_b; + + // for stack_trace_msg_hash relese + stack_trace_msg_hash_kv *trace_msg_kvps; + bool msg_clear_hash; + + /* profiler statistics */ + + // Switching between dual buffers. + u64 transfer_count; + // Total iteration count for all iterations. + u64 process_count; + u64 stackmap_clear_failed_count; + // perf buffer queue loss statistics. 
+ u64 perf_buf_lost_a_count; + u64 perf_buf_lost_b_count; + /* + * During the parsing process, it is possible for processes in procfs + * to be missing (processes that start and exit quickly). This variable + * is used to count the number of lost processes during the parsing process. + */ + atomic64_t process_lost_count; + // Stack error quantity statistics obtained by eBPF. + u64 stack_trace_err; + // Quantity statistics of data pushed. + u64 push_count; + + /* + * Record the time of the last data push + * (in seconds since system startup) + */ + u64 last_push_time; +}; + +static inline void profile_regex_lock(struct profiler_context *c) +{ + while (__atomic_test_and_set(&c->regex_lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +static inline void profile_regex_unlock(struct profiler_context *c) +{ + __atomic_clear(&c->regex_lock, __ATOMIC_RELEASE); +} + +void process_bpf_stacktraces(struct profiler_context *ctx, + struct bpf_tracer *t); +int do_profiler_regex_config(const char *pattern, struct profiler_context *ctx); +void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx, + u64 enable_flag); +int profiler_context_init(struct profiler_context *ctx, + const char *tag, + u8 type, + bool enable_profiler, + const char *state_map_name, + const char *stack_map_name_a, + const char *stack_map_name_b, + bool only_matched, bool use_delta_time); +bool run_conditions_check(void); +int java_libs_and_tools_install(void); +void push_and_release_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_hash_t * h, + bool is_force); +#endif /*DF_USER_PROFILE_COMMON_H */ diff --git a/agent/src/ebpf/user/profile/stringifier.c b/agent/src/ebpf/user/profile/stringifier.c index 2ec2cffde33..3e1532e403b 100644 --- a/agent/src/ebpf/user/profile/stringifier.c +++ b/agent/src/ebpf/user/profile/stringifier.c @@ -78,7 +78,7 @@ static const char *u_sym_prefix = ""; * in the loss of stack data. This situation is rare and difficult * to occur. 
*/ -static u64 stack_table_data_miss; +static __thread u64 stack_table_data_miss; u64 get_stack_table_data_miss_count(void) { @@ -159,14 +159,29 @@ static inline char *create_symbol_str(int len, char *src, const char *tag) } static inline int symcache_resolve(pid_t pid, void *resolver, u64 address, - struct bcc_symbol *sym) + struct bcc_symbol *sym, void *info_p) { ASSERT(pid >= 0); - if (pid == 0) - return bcc_symcache_resolve_no_demangle(resolver, address, sym); - else - return bcc_symcache_resolve(resolver, address, sym); + int ret = -1; + if (pid == 0) { + symbolizer_kernel_lock(); + ret = bcc_symcache_resolve_no_demangle(resolver, address, sym); + symbolizer_kernel_unlock(); + } else { + struct symbolizer_proc_info *p = info_p; + if (p) { + symbolizer_proc_lock(p); + if ((u64) resolver != (u64) p->syms_cache) { + symbolizer_proc_unlock(p); + return (-1); + } + ret = bcc_symcache_resolve(resolver, address, sym); + symbolizer_proc_unlock(p); + } + } + + return ret; } static char *symbol_name_fetch(pid_t pid, struct bcc_symbol *sym) @@ -205,11 +220,12 @@ static char *resolve_addr(struct bpf_tracer *t, pid_t pid, bool is_start_idx, char *ptr = NULL; char format_str[32]; struct bcc_symbol sym; + memset(&sym, 0, sizeof(sym)); void *resolver = get_symbol_cache(pid, is_create); if (resolver == NULL) goto resolver_err; - int ret = symcache_resolve(pid, resolver, address, &sym); + int ret = symcache_resolve(pid, resolver, address, &sym, info_p); if (ret == 0) { ptr = symbol_name_fetch(pid, &sym); if (ptr) { @@ -241,9 +257,11 @@ static char *resolve_addr(struct bpf_tracer *t, pid_t pid, bool is_start_idx, ptr = create_symbol_str(len, format_str, ""); if (info_p) { struct symbolizer_proc_info *p = info_p; + symbolizer_proc_lock(p); if (p->is_java && strstr(format_str, "perf-")) { p->unknown_syms_found = true; } + symbolizer_proc_unlock(p); } goto finish; } diff --git a/agent/src/ebpf/user/symbol.c b/agent/src/ebpf/user/symbol.c index 978ab28db9c..9f75e847d9d 100644 --- a/agent/src/ebpf/user/symbol.c +++ b/agent/src/ebpf/user/symbol.c @@ -489,36 +489,59 @@ uint64_t get_symbol_addr_from_binary(const char *bin, const char *symname) #ifndef AARCH64_MUSL static symbol_caches_hash_t syms_cache_hash; // for user process symbol caches static void *k_resolver; // for kernel symbol cache +static volatile u32 k_resolver_lock; static u64 sys_btime_msecs; // system boot time(milliseconds) +void symbolizer_kernel_lock(void) +{ + while (__atomic_test_and_set(&k_resolver_lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +void symbolizer_kernel_unlock(void) +{ + __atomic_clear(&k_resolver_lock, __ATOMIC_RELEASE); +} + static bool inline enable_proc_info_cache(void) { return (syms_cache_hash.buckets != NULL); } -static inline void free_symbolizer_cache_kvp(struct symbolizer_cache_kvp *kv) +void free_proc_cache(struct symbolizer_proc_info *p) { - if (kv->v.cache) { - bcc_free_symcache((void *)kv->v.cache, kv->k.pid); + symbolizer_proc_lock(p); + if (p->is_java) { + /* Delete target ns Java files */ + int pid = (int)p->pid; + if (pid > 0) { + clean_local_java_symbols_files(pid); + } + } + + if (p->syms_cache) { + bcc_free_symcache((void *)p->syms_cache, p->pid); free_symcache_count++; - kv->v.cache = 0; } + p->syms_cache = 0; + symbolizer_proc_unlock(p); + clib_mem_free((void *)p); +} +static void free_symbolizer_cache_kvp(struct symbolizer_cache_kvp *kv) +{ if (kv->v.proc_info_p) { struct symbolizer_proc_info *p; p = (struct symbolizer_proc_info *)kv->v.proc_info_p; - if (p->is_java) { - /* Delete target ns Java files 
*/ - int pid = (int)kv->k.pid; - if (pid > 0) { - clean_local_java_symbols_files(pid); - } - } - /* Ensure that all tasks are completed before releasing. */ if (AO_SUB_F(&p->use, 1) == 0) { - clib_mem_free((void *)p); + if (kv->v.cache != p->syms_cache) + ebpf_warning + ("kv->v.cache 0x%lx p->syms_cache 0x%lx\n", + kv->v.cache, p->syms_cache); + free_proc_cache(p); kv->v.proc_info_p = 0; + kv->v.cache = 0; } } } @@ -574,10 +597,14 @@ static inline struct symbolizer_proc_info *add_proc_info_to_cache(struct kv->v.proc_info_p = pointer_to_uword(p); kv->v.cache = 0; - if (symbol_caches_hash_add_del - (h, (symbol_caches_hash_kv *) kv, 1 /* is_add */ )) { + int ret; + if ((ret = symbol_caches_hash_add_del + (h, (symbol_caches_hash_kv *) kv, + 2 /* is_add = 2, Add but do not overwrite? */ )) != 0) { + // If it already exists, return -2. ebpf_warning - ("symbol_caches_hash_add_del() failed.(pid %d)\n", pid); + ("symbol_caches_hash_add_del() failed.(pid %d), return %d\n", + pid, ret); free_symbolizer_cache_kvp(kv); return NULL; } else @@ -589,7 +616,6 @@ static inline struct symbolizer_proc_info *add_proc_info_to_cache(struct static inline int del_proc_info_from_cache(struct symbolizer_cache_kvp *kv) { symbol_caches_hash_t *h = &syms_cache_hash; - free_symbolizer_cache_kvp(kv); if (symbol_caches_hash_add_del(h, (symbol_caches_hash_kv *) kv, 0 /* delete */ )) { ebpf_warning @@ -599,7 +625,7 @@ static inline int del_proc_info_from_cache(struct symbolizer_cache_kvp *kv) } else { __sync_fetch_and_add(&h->hash_elems_count, -1); } - + free_symbolizer_cache_kvp(kv); return 0; } @@ -770,6 +796,8 @@ static int config_symbolizer_proc_info(struct symbolizer_proc_info *p, int pid) p->new_java_syms_file = false; p->cache_need_update = false; p->gen_java_syms_file_err = false; + p->lock = 0; + p->syms_cache = 0; p->netns_id = get_netns_id_from_pid(pid); if (p->netns_id == 0) return ETR_INVAL; @@ -800,6 +828,18 @@ static int config_symbolizer_proc_info(struct symbolizer_proc_info *p, int pid) return ETR_OK; } +static inline void write_return_value(struct symbolizer_cache_kvp *kv, + struct symbolizer_proc_info *p, + u64 * stime, u64 * netns_id, char *name, + void **ptr) +{ + copy_process_name(kv, name); + *stime = cache_process_stime(kv); + *netns_id = cache_process_netns_id(kv); + *ptr = p; + AO_INC(&p->use); +} + void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, void **ptr) { @@ -824,13 +864,21 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, p = add_proc_info_to_cache(&kv); if (p == NULL) return; + symbolizer_proc_lock(p); + write_return_value(&kv, p, stime, netns_id, name, ptr); + symbolizer_proc_unlock(p); + } else { p = (struct symbolizer_proc_info *)kv.v.proc_info_p; + symbolizer_proc_lock(p); u64 curr_time = current_sys_time_secs(); if (!p->verified) { if (((curr_time - (p->stime / 1000ULL)) < PROC_INFO_VERIFY_TIME)) { - goto fetch_proc_info; + write_return_value(&kv, p, stime, netns_id, + name, ptr); + symbolizer_proc_unlock(p); + return; } /* @@ -840,21 +888,24 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, * for a period of time to avoid such situations. */ char comm[sizeof(p->comm)]; - u64 stime = (u64) + u64 proc_stime = (u64) get_process_starttime_and_comm(pid, comm, sizeof(comm)); - if (stime == 0) { + if (proc_stime == 0) { /* * Here, indicate that during the symbolization process, * the process has already terminated, but the process * information has not yet been cleared. 
In this case, we * continue to use the previously retained information. */ - goto fetch_proc_info; + write_return_value(&kv, p, stime, netns_id, + name, ptr); + symbolizer_proc_unlock(p); + return; } - p->stime = stime; + p->stime = proc_stime; memcpy(p->comm, comm, sizeof(p->comm)); p->comm[sizeof(p->comm) - 1] = '\0'; @@ -866,13 +917,9 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, p->verified = true; } + write_return_value(&kv, p, stime, netns_id, name, ptr); + symbolizer_proc_unlock(p); } - -fetch_proc_info: - copy_process_name(&kv, name); - *stime = cache_process_stime(&kv); - *netns_id = cache_process_netns_id(&kv); - *ptr = p; } static void *symbols_cache_update(symbol_caches_hash_t * h, @@ -911,6 +958,7 @@ static void *symbols_cache_update(symbol_caches_hash_t * h, p->new_java_syms_file = false; p->add_task_list = false; p->cache_need_update = false; + p->syms_cache = kv->v.cache; return (void *)kv->v.cache; } @@ -918,6 +966,8 @@ static inline void java_expired_update(symbol_caches_hash_t * h, struct symbolizer_cache_kvp *kv, struct symbolizer_proc_info *p) { + ASSERT(p != NULL); + /* Update java symbols table, will be executed during * the next Java symbolication */ @@ -948,8 +998,12 @@ void *get_symbol_cache(pid_t pid, bool new_cache) ASSERT(pid >= 0); if (k_resolver == NULL && pid == 0) { - k_resolver = (void *)bcc_symcache_new(-1, &lazy_opt); - sys_btime_msecs = get_sys_btime_msecs(); + symbolizer_kernel_lock(); + if (k_resolver == NULL) { + k_resolver = (void *)bcc_symcache_new(-1, &lazy_opt); + sys_btime_msecs = get_sys_btime_msecs(); + } + symbolizer_kernel_unlock(); } if (!new_cache) @@ -967,6 +1021,7 @@ void *get_symbol_cache(pid_t pid, bool new_cache) if (symbol_caches_hash_search(h, (symbol_caches_hash_kv *) & kv, (symbol_caches_hash_kv *) & kv) == 0) { p = (struct symbolizer_proc_info *)kv.v.proc_info_p; + symbolizer_proc_lock(p); u64 curr_time = current_sys_time_secs(); if (p->verified) { /* @@ -998,7 +1053,8 @@ void *get_symbol_cache(pid_t pid, bool new_cache) * generation of Java symbol tables, additional random value * for each java process's delay. */ - if (kv.v.cache == 0 && !p->gen_java_syms_file_err) { + if (kv.v.cache == 0 + && !p->gen_java_syms_file_err) { p->update_syms_table_time = generate_random_integer (PROFILER_DEFER_RANDOM_MAX); @@ -1009,18 +1065,27 @@ void *get_symbol_cache(pid_t pid, bool new_cache) && curr_time >= p->update_syms_table_time) { if (p->is_java) { java_expired_update(h, &kv, p); + symbolizer_proc_unlock(p); return (void *)kv.v.cache; } else { - return symbols_cache_update(h, &kv, p); + void *ret = + symbols_cache_update(h, &kv, p); + symbolizer_proc_unlock(p); + return ret; } } } else { + symbolizer_proc_unlock(p); /* Ensure that newly launched JAVA processes are detected. */ return NULL; } - if (kv.v.cache) + if (kv.v.cache) { + symbolizer_proc_unlock(p); return (void *)kv.v.cache; + } + + symbolizer_proc_unlock(p); } return NULL; diff --git a/agent/src/ebpf/user/symbol.h b/agent/src/ebpf/user/symbol.h index 9763a694aeb..15413baff5d 100644 --- a/agent/src/ebpf/user/symbol.h +++ b/agent/src/ebpf/user/symbol.h @@ -82,8 +82,23 @@ struct symbolizer_proc_info { char container_id[CONTAINER_ID_SIZE]; /* reference counting */ u64 use; + /* Protect symbolizer_proc_info from concurrent access by multiple threads. */ + volatile u32 lock; + /* Recording symbol resolution cache. 
*/ + volatile uword syms_cache; }; +static inline void symbolizer_proc_lock(struct symbolizer_proc_info *p) +{ + while (__atomic_test_and_set(&p->lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +static inline void symbolizer_proc_unlock(struct symbolizer_proc_info *p) +{ + __atomic_clear(&p->lock, __ATOMIC_RELEASE); +} + struct symbolizer_cache_kvp { struct { u64 pid; @@ -155,27 +170,28 @@ struct symbol_tracepoint { char *name; }; -static_always_inline u64 -cache_process_stime(struct symbolizer_cache_kvp *kv) +static_always_inline u64 cache_process_stime(struct symbolizer_cache_kvp *kv) { - return (u64)((struct symbolizer_proc_info *)kv->v.proc_info_p)->stime; + return (u64) ((struct symbolizer_proc_info *)kv->v.proc_info_p)->stime; } -static_always_inline u64 -cache_process_netns_id(struct symbolizer_cache_kvp *kv) +static_always_inline u64 cache_process_netns_id(struct symbolizer_cache_kvp *kv) { - return (u64)((struct symbolizer_proc_info *)kv->v.proc_info_p)->netns_id; + return (u64) ((struct symbolizer_proc_info *)kv->v. + proc_info_p)->netns_id; } static_always_inline void copy_process_name(struct symbolizer_cache_kvp *kv, char *dst) { static const int len = - sizeof(((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm); + sizeof(((struct symbolizer_proc_info *) kv->v.proc_info_p)->comm); strcpy_s_inline(dst, len, - ((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm, - strlen(((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm)); + ((struct symbolizer_proc_info *)kv->v. + proc_info_p)->comm, + strlen(((struct symbolizer_proc_info *)kv-> + v.proc_info_p)->comm)); } void free_uprobe_symbol(struct symbol_uprobe *u_sym, @@ -198,6 +214,9 @@ u64 get_pid_stime(pid_t pid); void exec_proc_info_cache_update(void); void set_java_syms_fetch_delay(int delay_secs); u64 get_java_syms_fetch_delay(void); +void free_proc_cache(struct symbolizer_proc_info *p); +void symbolizer_kernel_lock(void); +void symbolizer_kernel_unlock(void); #endif int create_and_init_proc_info_caches(void); void get_container_id_from_procs_cache(pid_t pid, uint8_t * id, int id_size); diff --git a/agent/src/ebpf/user/tracer.c b/agent/src/ebpf/user/tracer.c index 3e0fc0637b1..46db256ed8d 100644 --- a/agent/src/ebpf/user/tracer.c +++ b/agent/src/ebpf/user/tracer.c @@ -218,13 +218,6 @@ int release_bpf_tracer(const char *name) if (t->release_cb != NULL) t->release_cb(t); - /* - * Check if the reader thread has exited before releasing - * the reader lock. - */ - while (t->perf_worker[0] != 0) - sleep(1); - if (t->lock) { clib_mem_free((void *)t->lock); t->lock = NULL; @@ -253,8 +246,8 @@ int enable_tracer_reader_work(const char *prefix_name, int idx, int ret; char name[TASK_COMM_LEN]; snprintf(name, sizeof(name), "%s-%d", prefix_name, idx); - ret = pthread_create(&tracer->perf_worker[idx], NULL, fn, - (void *)(uint64_t)idx); + ret = pthread_create(&tracer->perf_workers[idx], NULL, fn, + (void *)(uint64_t) idx); if (ret) { ebpf_warning("tracer reader(%s), pthread_create " "is error:%s\n", name, strerror(errno)); @@ -262,14 +255,14 @@ int enable_tracer_reader_work(const char *prefix_name, int idx, } /* set thread name */ - pthread_setname_np(tracer->perf_worker[idx], name); + pthread_setname_np(tracer->perf_workers[idx], name); /* * Separating threads is to automatically release * resources after pthread_exit(), without being * blocked or stuck. 
*/ - ret = pthread_detach(tracer->perf_worker[idx]); + ret = pthread_detach(tracer->perf_workers[idx]); if (ret != 0) { ebpf_warning("Error detaching thread, error:%s\n", strerror(errno)); @@ -346,6 +339,7 @@ struct bpf_tracer *setup_bpf_tracer(const char *name, bt->release_cb = free_cb; bt->create_cb = create_cb; bt->sample_freq = sample_freq; + bt->enable_sample = false; bt->dispatch_workers_nr = workers_nr; bt->process_fn = handle; @@ -871,8 +865,8 @@ static int tracepoint_detach(struct tracepoint *tp) int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, int *probes_count) { - int (*probe_fun) (struct probe * p) = NULL; - int (*tracepoint_fun) (struct tracepoint * p) = NULL; + int (*probe_fun)(struct probe * p) = NULL; + int (*tracepoint_fun)(struct tracepoint * p) = NULL; if (type == HOOK_ATTACH) { probe_fun = probe_attach; tracepoint_fun = tracepoint_attach; @@ -959,6 +953,10 @@ int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, } perf_event: + + if (!tracer->enable_sample) + return ETR_OK; + /* * perf event */ @@ -1010,8 +1008,8 @@ int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, errno = 0; int ret = program__detach_perf_event(tracer->per_cpu_fds, - ARRAY_SIZE(tracer-> - per_cpu_fds)); + ARRAY_SIZE + (tracer->per_cpu_fds)); if (!ret) { ebpf_info ("tracer \"%s\" detach perf event prog successful.\n", @@ -1169,7 +1167,7 @@ static int perf_reader_setup(struct bpf_perf_reader *perf_reader, int thread_nr) spread_id = 0; struct reader_forward_info *fwd_info = - malloc(sizeof(struct reader_forward_info)); + malloc(sizeof(struct reader_forward_info)); if (fwd_info == NULL) { ebpf_error("reader_forward_info malloc() failed.\n"); return ETR_NOMEM; @@ -1181,12 +1179,10 @@ static int perf_reader_setup(struct bpf_perf_reader *perf_reader, int thread_nr) ebpf_info("Perf buffer reader cpu(%d) -> queue(%d)\n", fwd_info->cpu_id, fwd_info->queue_id); - reader = - (struct perf_reader *) + reader = (struct perf_reader *) bpf_open_perf_buffer(perf_reader->raw_cb, perf_reader->lost_cb, - (void *)fwd_info, -1, i, - pages_cnt); + (void *)fwd_info, -1, i, pages_cnt); if (reader == NULL) { ebpf_error("bpf_open_perf_buffer() failed.\n"); return ETR_NORESOURCE; @@ -1436,7 +1432,7 @@ static int tracer_sockopt_set(sockoptid_t opt, const void *conf, size_t size) } static int tracer_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t * outsize) + void **out, size_t *outsize) { *outsize = sizeof(struct bpf_tracer_param_array) + sizeof(struct bpf_tracer_param) * tracers_count; diff --git a/agent/src/ebpf/user/tracer.h b/agent/src/ebpf/user/tracer.h index 3a1eff79ef8..411795ca0eb 100644 --- a/agent/src/ebpf/user/tracer.h +++ b/agent/src/ebpf/user/tracer.h @@ -347,11 +347,12 @@ struct bpf_tracer { */ int per_cpu_fds[MAX_CPU_NR]; int sample_freq; // sample frequency, Hertz. + bool enable_sample; // Enable CPU sampling? /* * Data distribution processing worker, queues */ - pthread_t perf_worker[MAX_CPU_NR]; // Main thread for user-space receiving perf-buffer data + pthread_t perf_workers[MAX_CPU_NR]; // Main thread for user-space receiving perf-buffer data pthread_t dispatch_workers[MAX_CPU_NR]; // Dispatch threads int dispatch_workers_nr; // Number of dispatch threads struct queue queues[MAX_CPU_NR]; // Dispatch queues, each dispatch thread has its corresponding queue. 
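The locks introduced in this patch (profile_regex_lock(), symbolizer_proc_lock(), symbolizer_kernel_lock()) all use the same test-and-set spin lock idiom built on __atomic_test_and_set()/__atomic_clear(). The following is a minimal, self-contained sketch of that idiom for reference only; it is not part of the patch, uses an unsigned char flag instead of the agent's volatile u32 fields, and replaces the agent's CLIB_PAUSE() hint with a bare busy-wait.

#include <stdio.h>

static volatile unsigned char demo_lock;

static inline void demo_spin_lock(volatile unsigned char *l)
{
	/* Spin until __atomic_test_and_set() reports the flag was clear,
	 * i.e. this thread acquired the lock. */
	while (__atomic_test_and_set(l, __ATOMIC_ACQUIRE))
		;	/* the agent calls CLIB_PAUSE() here to ease contention */
}

static inline void demo_spin_unlock(volatile unsigned char *l)
{
	/* Release the lock with release ordering. */
	__atomic_clear(l, __ATOMIC_RELEASE);
}

int main(void)
{
	demo_spin_lock(&demo_lock);
	printf("inside the critical section\n");
	demo_spin_unlock(&demo_lock);
	return 0;
}

The same acquire/release pairing is what protects profiler_regex and symbolizer_proc_info against the reader and updater threads described in the hunks above.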
diff --git a/server/agent_config/example.yaml b/server/agent_config/example.yaml index 7eca5f3d362..196dd40f86c 100644 --- a/server/agent_config/example.yaml +++ b/server/agent_config/example.yaml @@ -1310,6 +1310,15 @@ vtap_group_id: g-xxxxxx ## Note: Only collect IO events with delay exceeding this threshold, the default value is 1ms. #io-event-minimal-duration: 1ms + ## Java compliant update latency time + ## Default: 600s. Range: [5, 3600]s + ## Note: + ## When deepflow-agent finds that an unresolved function name appears in the function call stack + ## of a Java process, it will trigger the regeneration of the symbol file of the process. + ## Because Java utilizes the Just-In-Time (JIT) compilation mechanism, to obtain more symbols for + ## Java processes, the regeneration will be deferred for a period of time. + #java-symbol-file-refresh-defer-interval: 600s + ## on-cpu profile configuration #on-cpu-profile: ## eBPF on-cpu Profile Switch @@ -1334,13 +1343,27 @@ vtap_group_id: g-xxxxxx ## Default: ^deepflow-.* #regex: ^deepflow-.* - ## When deepflow-agent finds that an unresolved function name appears in the function call stack - ## of a Java process, it will trigger the regeneration of the symbol file of the process. - ## Because Java utilizes the Just-In-Time (JIT) compilation mechanism, to obtain more symbols for - ## Java processes, the regeneration will be deferred for a period of time. - ## Default: 600s. Range: [5, 3600]s - ## The unit of measurement used is seconds. - #java-symbol-file-refresh-defer-interval: 600s + ## off-cpu profile configuration + #off-cpu-profile: + ## eBPF off-cpu Profile Switch + ## Default: true + #disabled: true + + ## Sampling process name + ## Default: ^deepflow-.* + #regex: ^deepflow-.* + + ## Configure the minimum blocking event time + ## Default: 1us. Range: [1, 2^32-1)us + ## Note: + ## Scheduler events are still high-frequency events, as their rate may exceed 1 million events + ## per second, so caution should still be exercised. + ## If overhead remains an issue, you can configure the 'minblock' tunable parameter here. + ## If the off-CPU time is less than the value configured in this item, the data will be discarded. + ## If your goal is to trace longer blocking events, increasing this parameter can filter out shorter + ## blocking events, further reducing overhead. Additionally, we will not collect events with a block + ## time exceeding 1 hour. + #minblock: 1us ###################################### ## Agent Running in Standalone Mode ##
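For reference, a usage sketch for the off-cpu keys documented in the example.yaml hunk above (not part of the patch): uncommenting them at the same nesting level as on-cpu-profile with, for example,

  off-cpu-profile:
    disabled: false
    regex: ^deepflow-.*
    minblock: 50us

would enable off-cpu profiling for processes matching the regex and discard blocking events shorter than 50us. The values shown are arbitrary illustrations; only the key names and the microsecond unit come from the hunk above, and any enclosing parent keys of the agent configuration are omitted here.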