From 0681e37a264a831136dda500f58e42e7ab988cbc Mon Sep 17 00:00:00 2001 From: Jiping Yin Date: Thu, 9 May 2024 19:00:33 +0800 Subject: [PATCH] [eBPF] Adjust Profiler --- agent/src/ebpf/Makefile | 4 +- agent/src/ebpf/kernel/include/bpf_base.h | 1 + agent/src/ebpf/kernel/include/perf_profiler.h | 3 +- agent/src/ebpf/kernel/perf_profiler.bpf.c | 2 + agent/src/ebpf/mod.rs | 14 + agent/src/ebpf/tools/code.style | 6 + agent/src/ebpf/user/config.h | 46 +- .../src/ebpf/user/profile/extended/extended.c | 32 + .../src/ebpf/user/profile/extended/extended.h | 20 + .../ebpf/user/profile/java/gen_syms_file.c | 10 +- agent/src/ebpf/user/profile/perf_profiler.c | 1304 +++-------------- agent/src/ebpf/user/profile/perf_profiler.h | 46 +- agent/src/ebpf/user/profile/profile_common.c | 1067 ++++++++++++++ agent/src/ebpf/user/profile/profile_common.h | 162 ++ agent/src/ebpf/user/profile/stringifier.c | 32 +- agent/src/ebpf/user/symbol.c | 133 +- agent/src/ebpf/user/symbol.h | 37 +- agent/src/ebpf/user/tracer.c | 38 +- agent/src/ebpf/user/tracer.h | 3 +- server/agent_config/example.yaml | 37 +- 20 files changed, 1769 insertions(+), 1228 deletions(-) create mode 100644 agent/src/ebpf/tools/code.style create mode 100644 agent/src/ebpf/user/profile/extended/extended.c create mode 100644 agent/src/ebpf/user/profile/extended/extended.h create mode 100644 agent/src/ebpf/user/profile/profile_common.c create mode 100644 agent/src/ebpf/user/profile/profile_common.h diff --git a/agent/src/ebpf/Makefile b/agent/src/ebpf/Makefile index 993cd786528..6dace6516cd 100644 --- a/agent/src/ebpf/Makefile +++ b/agent/src/ebpf/Makefile @@ -88,6 +88,8 @@ OBJS := user/elf.o \ user/mem.o \ user/vec.o \ user/bihash.o \ + user/profile/profile_common.o \ + $(patsubst %.c,%.o,$(wildcard user/profile/extended/*.c)) \ user/profile/perf_profiler.o \ user/profile/stringifier.o \ user/profile/java/df_jattach.o \ @@ -175,7 +177,7 @@ user/perf_profiler_bpf_common.c: tools/bintobuffer kernel/perf_profiler.bpf.c $(STATIC_OBJDIR) $(SHARED_OBJDIR): $(call msg,MKDIR,$@) - $(Q)mkdir -p $@/user/profile/java + $(Q)mkdir -p $@/user/profile/{java,extended} $(STATIC_OBJDIR)/user/socket.o: user/socket.c $(SOCKET_TRACE_ELFS) | $(STATIC_OBJDIR) $(call msg,CC,$@) diff --git a/agent/src/ebpf/kernel/include/bpf_base.h b/agent/src/ebpf/kernel/include/bpf_base.h index 7fd9b079170..b233f9422e9 100644 --- a/agent/src/ebpf/kernel/include/bpf_base.h +++ b/agent/src/ebpf/kernel/include/bpf_base.h @@ -261,6 +261,7 @@ _Pragma("GCC error \"PT_GO_REGS_PARM\""); #define KRETPROG(F) SEC("kretprobe/"__stringify(F)) int kretprobe__##F #define KPROG(F) SEC("kprobe/"__stringify(F)) int kprobe__##F #define TPPROG(F) SEC("tracepoint/syscalls/"__stringify(F)) int bpf_func_##F +#define TP_SCHED_PROG(F) SEC("tracepoint/sched/"__stringify(F)) int bpf_func_##F #ifndef CUR_CPU_IDENTIFIER #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) diff --git a/agent/src/ebpf/kernel/include/perf_profiler.h b/agent/src/ebpf/kernel/include/perf_profiler.h index 242659dfce6..ea2c74dc49b 100644 --- a/agent/src/ebpf/kernel/include/perf_profiler.h +++ b/agent/src/ebpf/kernel/include/perf_profiler.h @@ -37,7 +37,7 @@ typedef enum { ERROR_IDX, /* Count the number of failed push notifications. */ ENABLE_IDX, /* Enable profiler sampling flag. 0: disable sampling; 1: enable sampling. 
*/ - + MINBLOCK_TIME_IDX, /* The minimum blocking time, applied in the profiler extension.*/ PROFILER_CNT } profiler_idx; @@ -49,6 +49,7 @@ struct stack_trace_key_t { int kernstack; int userstack; __u64 timestamp; + __u64 duration_ns; }; #endif /* DF_BPF_PERF_PROFILER_H */ diff --git a/agent/src/ebpf/kernel/perf_profiler.bpf.c b/agent/src/ebpf/kernel/perf_profiler.bpf.c index 0114e600e7a..c64e839bd35 100644 --- a/agent/src/ebpf/kernel/perf_profiler.bpf.c +++ b/agent/src/ebpf/kernel/perf_profiler.bpf.c @@ -124,6 +124,7 @@ int bpf_perf_event(struct bpf_perf_event_data *ctx) key.cpu = bpf_get_smp_processor_id(); bpf_get_current_comm(&key.comm, sizeof(key.comm)); key.timestamp = bpf_ktime_get_ns(); + key.duration_ns = 1; /* * Note: @@ -213,3 +214,4 @@ int bpf_perf_event(struct bpf_perf_event_data *ctx) return 0; } + diff --git a/agent/src/ebpf/mod.rs b/agent/src/ebpf/mod.rs index c6295987d7b..8d3e76c4cdd 100644 --- a/agent/src/ebpf/mod.rs +++ b/agent/src/ebpf/mod.rs @@ -153,6 +153,12 @@ pub const EVENT_TYPE_PROC_EXEC: u32 = 1 << 5; #[allow(dead_code)] pub const EVENT_TYPE_PROC_EXIT: u32 = 1 << 6; +// Profiler types +#[allow(dead_code)] +pub const PROFILER_TYPE_UNKNOWN: u8 = 0; +#[allow(dead_code)] +pub const PROFILER_TYPE_ONCPU: u8 = 1; + //Process exec/exit events #[repr(C)] #[derive(Debug, Copy, Clone)] @@ -367,6 +373,7 @@ pub struct SK_TRACE_STATS { #[repr(C)] #[derive(Debug, Copy, Clone)] pub struct stack_profile_data { + pub profiler_type : u8, // Profiler type, such as 1(PROFILER_TYPE_ONCPU). pub timestamp: u64, // Timestamp of the stack trace data(unit: nanoseconds). pub pid: u32, // User-space process-ID. /* @@ -384,6 +391,10 @@ pub struct stack_profile_data { * The profiler captures the number of occurrences of the same * data by querying with the quadruple * "" as the key. + * Note: + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in Microseconds) are + * used. */ pub count: u32, /* @@ -647,6 +658,9 @@ extern "C" { timeout: c_int, callback: extern "C" fn(data: *mut c_char, len: c_int), ) -> c_int; + + pub fn enable_oncpu_profiler() -> c_int; + pub fn disable_oncpu_profiler() -> c_int; } #[no_mangle] diff --git a/agent/src/ebpf/tools/code.style b/agent/src/ebpf/tools/code.style new file mode 100644 index 00000000000..ceb7791f998 --- /dev/null +++ b/agent/src/ebpf/tools/code.style @@ -0,0 +1,6 @@ +#! /bin/bash + +indent -npro -kr -i8 -ts8 -nss -nsc -ncs -nprs -sob -l80 -ss -cp1 --space-after-for --space-after-if --space-after-while --space-special-semicolon --blank-lines-after-procedures -v $1 +sed -i "s/{ }/{}/g" $1 +sed -i "s/) ;/);/g" $1 +sed -i "s/^ //g" $1 diff --git a/agent/src/ebpf/user/config.h b/agent/src/ebpf/user/config.h index aa675a33c4d..30853862ac5 100644 --- a/agent/src/ebpf/user/config.h +++ b/agent/src/ebpf/user/config.h @@ -19,7 +19,7 @@ #define EV_NAME_SIZE 1024 -#define BOOT_TIME_UPDATE_PERIOD 60 // 系统启动时间更新周期, 单位:秒 +#define BOOT_TIME_UPDATE_PERIOD 60 // 系统启动时间更新周期, 单位:秒 // eBPF Map Name #define MAP_MEMBERS_OFFSET_NAME "__members_offset" @@ -68,7 +68,7 @@ enum { //thread index for bihash enum { THREAD_PROFILER_READER_IDX = 0, - THREAD_PROC_ACT_IDX_BASE + THREAD_PROC_ACT_IDX_BASE = 2, }; /* @@ -122,7 +122,7 @@ enum { * may lead to increased overhead and memory usage, so it is * recommended to use it with caution. 
*/ -#ifndef PERF_MAX_STACK_DEPTH +#ifndef PERF_MAX_STACK_DEPTH #define PERF_MAX_STACK_DEPTH 127 #endif @@ -131,19 +131,19 @@ enum { */ #define MAP_STACK_A_NAME "__stack_map_a" #define MAP_STACK_B_NAME "__stack_map_b" -#define MAP_PROFILER_STATE_MAP "__profiler_state_map" +#define MAP_PROFILER_STATE_NAME "__profiler_state_map" #define STRINGIFIER_STACK_STR_HASH_BUCKETS_NUM 8192 -#define STRINGIFIER_STACK_STR_HASH_MEM_SZ (1ULL << 30) // 1Gbytes +#define STRINGIFIER_STACK_STR_HASH_MEM_SZ (1ULL << 30) // 1Gbytes #define SYMBOLIZER_CACHES_HASH_BUCKETS_NUM 8192 -#define SYMBOLIZER_CACHES_HASH_MEM_SZ (1ULL << 31) // 2Gbytes +#define SYMBOLIZER_CACHES_HASH_MEM_SZ (1ULL << 31) // 2Gbytes #define STACK_TRACE_MSG_HASH_BUCKETS_NUM 8192 -#define STACK_TRACE_MSG_HASH_MEM_SZ (1ULL << 32) // 4Gbytes +#define STACK_TRACE_MSG_HASH_MEM_SZ (1ULL << 32) // 4Gbytes -#define PROFILER_READER_EPOLL_TIMEOUT 500 //msecs -#define EPOLL_SHORT_TIMEOUT 100 //mescs +#define PROFILER_READER_EPOLL_TIMEOUT 500 //msecs +#define EPOLL_SHORT_TIMEOUT 100 //mescs /* * Process information recalibration time, this time is the number of seconds @@ -152,7 +152,7 @@ enum { * The Java process will delay obtaining the symbol table by * 'PROC_INFO_VERIFY_TIME' seconds after it starts running. */ -#define PROC_INFO_VERIFY_TIME 60 // 60 seconds +#define PROC_INFO_VERIFY_TIME 60 // 60 seconds /* * This value is used to determine which type of Java agent's so library to @@ -168,12 +168,12 @@ enum { * date the Java symbol table. This is done The purpose is to avoid freque- * nt updates of the java symbol table. */ -#define JAVA_SYMS_UPDATE_DELAY_DEF 60 // 60 seconds -#define JAVA_SYMS_UPDATE_DELAY_MIN 5 // 5 seconds -#define JAVA_SYMS_UPDATE_DELAY_MAX 3600 // 3600 seconds +#define JAVA_SYMS_UPDATE_DELAY_DEF 60 // 60 seconds +#define JAVA_SYMS_UPDATE_DELAY_MIN 5 // 5 seconds +#define JAVA_SYMS_UPDATE_DELAY_MAX 3600 // 3600 seconds /* Profiler - maximum data push interval time (in nanosecond). */ -#define MAX_PUSH_MSG_TIME_INTERVAL 1000000000ULL /* 1 seconds */ +#define MAX_PUSH_MSG_TIME_INTERVAL 1000000000ULL /* 1 seconds */ /* * timer config @@ -192,24 +192,24 @@ enum { * the data resident in the eBPF buffer. This value is the periodic time, unit * is milliseconds. */ -#define KICK_KERN_PERIOD 10 // 10 ticks(100 milliseconds) +#define KICK_KERN_PERIOD 10 // 10 ticks(100 milliseconds) /* * System boot time update cycle time, unit is milliseconds. */ -#define SYS_TIME_UPDATE_PERIOD 1000 // 1000 ticks(10 seconds) +#define SYS_TIME_UPDATE_PERIOD 1000 // 1000 ticks(10 seconds) /* * Check whether the eBPF Map exceeds the maximum value and use it to release * stale data (unit is milliseconds). */ -#define CHECK_MAP_EXCEEDED_PERIOD 100 // 100 ticks(1 seconds) +#define CHECK_MAP_EXCEEDED_PERIOD 100 // 100 ticks(1 seconds) /* * Used to check whether the kernel adaptation is successful, here is the * check cycle time (unit is milliseconds). */ -#define CHECK_KERN_ADAPT_PERIOD 100 // 100 ticks(1 seconds) +#define CHECK_KERN_ADAPT_PERIOD 100 // 100 ticks(1 seconds) /* * The maximum space occupied by the Java symbol files in the target POD. @@ -217,15 +217,15 @@ enum { * of 2Mi to 100Mi. If the configuration value is outside this range, the * default value of 10(10Mi), will be used. 
*/ -#define JAVA_POD_WRITE_FILES_SPACE_MIN 2097152 // 2Mi -#define JAVA_POD_WRITE_FILES_SPACE_MAX 104857600 // 100Mi -#define JAVA_POD_WRITE_FILES_SPACE_DEF 10485760 // 10Mi +#define JAVA_POD_WRITE_FILES_SPACE_MIN 2097152 // 2Mi +#define JAVA_POD_WRITE_FILES_SPACE_MAX 104857600 // 100Mi +#define JAVA_POD_WRITE_FILES_SPACE_DEF 10485760 // 10Mi /* * The `df_java_agent_musl.so` and `df_java_agent.so` files will also be * placed in the target POD for loading operations. They occupy less than * 300Ki of space. */ -#define JAVA_POD_EXTRA_SPACE_MMA 307200 // 300Ki +#define JAVA_POD_EXTRA_SPACE_MMA 307200 // 300Ki /* * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, @@ -263,6 +263,6 @@ enum { * The random value has a maximum limit specified above(measured in seconds). */ -#define PROFILER_DEFER_RANDOM_MAX 60 // 60 seconds +#define PROFILER_DEFER_RANDOM_MAX 60 // 60 seconds #endif /* DF_EBPF_CONFIG_H */ diff --git a/agent/src/ebpf/user/profile/extended/extended.c b/agent/src/ebpf/user/profile/extended/extended.c new file mode 100644 index 00000000000..fe4ddb92207 --- /dev/null +++ b/agent/src/ebpf/user/profile/extended/extended.c @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include "../../config.h" +#include "../../utils.h" +#include "../../common.h" +#include "../../mem.h" +#include "../../log.h" +#include "../../types.h" +#include "../../vec.h" +#include "../../tracer.h" +#include "../../socket.h" + +int __attribute__ ((weak)) extended_reader_create(struct bpf_tracer *tracer) +{ + return 0; +} diff --git a/agent/src/ebpf/user/profile/extended/extended.h b/agent/src/ebpf/user/profile/extended/extended.h new file mode 100644 index 00000000000..0aa597cc161 --- /dev/null +++ b/agent/src/ebpf/user/profile/extended/extended.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef DF_PROFILE_EXT_H +#define DF_PROFILE_EXT_H +int extended_reader_create(struct bpf_tracer *tracer); +#endif /* DF_PROFILE_EXT_H */ diff --git a/agent/src/ebpf/user/profile/java/gen_syms_file.c b/agent/src/ebpf/user/profile/java/gen_syms_file.c index 042626c3440..0dc31e4a7e6 100644 --- a/agent/src/ebpf/user/profile/java/gen_syms_file.c +++ b/agent/src/ebpf/user/profile/java/gen_syms_file.c @@ -62,8 +62,9 @@ void gen_java_symbols_file(int pid, int *ret_val, bool error_occurred) char args[PERF_PATH_SZ * 2]; snprintf(args, sizeof(args), - "%d %d," DF_AGENT_LOCAL_PATH_FMT ".map," DF_AGENT_LOCAL_PATH_FMT ".log", - pid, g_java_syms_write_bytes_max, pid, pid); + "%d %d," DF_AGENT_LOCAL_PATH_FMT ".map," + DF_AGENT_LOCAL_PATH_FMT ".log", pid, + g_java_syms_write_bytes_max, pid, pid); i64 curr_local_sz; curr_local_sz = get_local_symbol_file_sz(pid, target_ns_pid); @@ -131,6 +132,7 @@ void java_syms_update_main(void *arg) if (task != NULL) { struct symbolizer_proc_info *p = task->p; + symbolizer_proc_lock(p); /* JAVA process has not exited. */ if (AO_GET(&p->use) > 1) { int ret; @@ -151,10 +153,10 @@ void java_syms_update_main(void *arg) AO_SET(&p->new_java_syms_file, true); } - + symbolizer_proc_unlock(p); /* Ensure that all tasks are completed before releasing. */ if (AO_SUB_F(&p->use, 1) == 0) { - clib_mem_free((void *)p); + free_proc_cache(p); } clib_mem_free((void *)task); diff --git a/agent/src/ebpf/user/profile/perf_profiler.c b/agent/src/ebpf/user/profile/perf_profiler.c index 6be2a72f61e..287daa2b6fb 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.c +++ b/agent/src/ebpf/user/profile/perf_profiler.c @@ -44,113 +44,37 @@ #include #include "java/config.h" #include "java/df_jattach.h" +#include "profile_common.h" #include "../perf_profiler_bpf_common.c" -/* - * This section is for symbolization of Java addresses, and we need - * to prepare two so librarys, one for GNU and the other for MUSL: - * - * df_java_agent.so - * df_java_agent_musl.so - * - * These two files need to be saved in the '/tmp' directory, and the - * agent so library will be injected into the JVM to generate - * 'perf-.map'. - */ -#include "java_agent_so_gnu.c" -#include "java_agent_so_musl.c" -/* use for java symbols generate */ -#include "deepflow_jattach_bin.c" - #define LOG_CP_TAG "[CP] " #define CP_TRACER_NAME "continuous_profiler" #define CP_PERF_PG_NUM 16 +static struct profiler_context oncpu_ctx; + /* The maximum bytes limit for writing the df_perf-PID.map file by agent.so */ int g_java_syms_write_bytes_max; +static bool g_enable_oncpu = true; + /* Used for handling updates to JAVA symbol files */ static pthread_t java_syms_update_thread; -extern int major, minor; extern char linux_release[128]; extern __thread uword thread_index; -struct stack_trace_key_t *raw_stack_data; -static u64 stack_trace_lost; -static struct bpf_tracer *profiler_tracer; -static volatile u64 profiler_stop; - -// for stack_trace_msg_hash relese -static __thread stack_trace_msg_hash_kv *trace_msg_kvps; -static __thread bool msg_clear_hash; +struct bpf_tracer *profiler_tracer; // for flame-graph test static FILE *folded_file; #define FOLDED_FILE_PATH "./profiler.folded" -static char *flame_graph_start_time; +char *flame_graph_start_time; static char *flame_graph_end_time; -/* profiler start time(monotonic seconds). 
*/ -static u64 start_time; -/* Record the time of the last data push - * (in seconds since system startup)*/ -static u64 last_push_time; -static u64 push_count; - -/* - * To perform regular expression matching on process names, - * the 'profiler_regex' is set using the 'set_profiler_regex()' - * interface. Processes that successfully match the regular - * expression are aggregated using the key: - * `{pid + stime + u_stack_id + k_stack_id + tid + cpu}`. - * - * For processes that do not match, they are aggregated using the - * key: - * ``. - */ -static regex_t profiler_regex; -static bool regex_existed = false; - -/* - * 'cpu_aggregation_flag' is used to set whether to retrieve CPUID - * and include it in the aggregation of stack trace data. - * - * If valude is set to 1, CPUID will be retrieved and included in - * the aggregation of stack trace data. If value is set to 0, - * CPUID will not be retrieved and will not be included in the - * aggregation. Any other value is considered invalid. - */ -static volatile u64 cpu_aggregation_flag; - -/* - * Cache hash: obtain folded stack trace string from stack ID. - */ -static stack_str_hash_t g_stack_str_hash; - -/* - * Used for tracking data statistics and pushing. - */ -static stack_trace_msg_hash_t g_msg_hash; - -/* - * The iteration count causes BPF to switch buffers with each iteration. - */ -static u64 transfer_count; -static u64 process_count; - -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h); -static void print_cp_tracer_status(struct bpf_tracer *t); - -/* - * During the parsing process, it is possible for processes in procfs - * to be missing (processes that start and exit quickly). This variable - * is used to count the number of lost processes during the parsing process. - */ -static atomic64_t process_lost_count; +static void print_cp_tracer_status(struct bpf_tracer *t, + struct profiler_context *ctx); /* Continuous Profiler debug related settings. */ static pthread_mutex_t cpdbg_mutex; @@ -160,160 +84,33 @@ static bool cpdbg_use_remote; static uint32_t cpdbg_start_time; static uint32_t cpdbg_timeout; -/* Record all stack IDs in each iteration for quick retrieval. */ -struct stack_ids_bitmap stack_ids_a; -struct stack_ids_bitmap stack_ids_b; -/* This vector table is used to remove a stack from the stack map. */ -static int *clear_stack_ids_a; -static int *clear_stack_ids_b; -static u64 stackmap_clear_failed_count; - -/* perf buffer queue loss statistics */ -static u64 perf_buf_lost_a_count; -static u64 perf_buf_lost_b_count; - -static volatile u64 g_enable_perf_sample; - -static u64 get_process_lost_count(void) -{ - return atomic64_read(&process_lost_count); -} - -static inline stack_trace_msg_t *alloc_stack_trace_msg(int len) -{ - void *trace_msg; - trace_msg = clib_mem_alloc_aligned("stack_msg", len, 0, NULL); - if (trace_msg == NULL) { - ebpf_warning("stack trace msg alloc memory failed.\n"); - } else { - stack_trace_msg_t *msg = trace_msg; - return msg; - } - - return NULL; -} - -/* - * The invocation of this interface is always when the process name does - * not match. 
- */ -static void set_msg_kvp_by_comm(stack_trace_msg_kv_t * kvp, - struct stack_trace_key_t *v, void *msg_value) +static u64 get_process_lost_count(struct profiler_context *ctx) { - strcpy_s_inline(kvp->c_k.comm, sizeof(kvp->c_k.comm), - v->comm, strlen(v->comm)); - kvp->c_k.cpu = v->cpu; - kvp->c_k.pid = v->tgid; - kvp->c_k.reserved = 0; - kvp->msg_ptr = pointer_to_uword(msg_value); -} - -static void set_msg_kvp(stack_trace_msg_kv_t * kvp, - struct stack_trace_key_t *v, u64 stime, void *msg_value) -{ - kvp->k.tgid = v->tgid; - kvp->k.pid = v->pid; - kvp->k.stime = stime; - kvp->k.cpu = v->cpu; - kvp->k.u_stack_id = (u32) v->userstack; - kvp->k.k_stack_id = (u32) v->kernstack; - kvp->msg_ptr = pointer_to_uword(msg_value); -} - -static void set_stack_trace_msg(stack_trace_msg_t * msg, - struct stack_trace_key_t *v, - bool matched, - u64 stime, - u64 ns_id, - const char *process_name, - const char *container_id) -{ - msg->pid = v->tgid; - msg->tid = v->pid; - msg->cpu = v->cpu; - msg->u_stack_id = (u32) v->userstack; - msg->k_stack_id = (u32) v->kernstack; - strcpy_s_inline(msg->comm, sizeof(msg->comm), v->comm, strlen(v->comm)); - msg->stime = stime; - msg->netns_id = ns_id; - if (container_id != NULL) { - strcpy_s_inline(msg->container_id, sizeof(msg->container_id), - container_id, strlen(container_id)); - } - - if (stime > 0) { - /* - * Note: There is no process with PID 0 in procfs. - * If the PID is 0, it will return the kernel's - * startup time, and the process name will be - * obtained from data retrieved through eBPF. - */ - if (msg->pid == 0) { - memcpy(msg->process_name, v->comm, sizeof(msg->comm)); - } else { - if (process_name != NULL) { - strcpy_s_inline(msg->process_name, - sizeof(msg->process_name), - process_name, - strlen(process_name)); - } - } - - } else { - - /* - * If the process has already exited, then execution reaches - * this point, which means aggregating data based on the - * process name. - */ - strcpy_s_inline(msg->process_name, sizeof(msg->process_name), - v->comm, strlen(v->comm)); - atomic64_inc(&process_lost_count); - } - - if (!matched || stime <= 0) { - /* The aggregation method is identified as - * { process name + [u,k]stack_trace_id + cpu} */ - msg->stime = 0; - if (!matched) { - msg->pid = msg->tid = 0; - snprintf((char *)msg->process_name, - sizeof(msg->process_name), "%s", "Total"); - } - } - - msg->time_stamp = gettime(CLOCK_REALTIME, TIME_TYPE_NAN); - msg->count = 1; - msg->data_ptr = pointer_to_uword(&msg->data[0]); - - /* Only use for test flame graph. 
*/ - if (flame_graph_start_time == NULL) { - flame_graph_start_time = gen_file_name_by_datetime(); - } + return atomic64_read(&ctx->process_lost_count); } static void reader_lost_cb_a(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_a_count++; + oncpu_ctx.perf_buf_lost_a_count++; } static void reader_lost_cb_b(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_b_count++; + oncpu_ctx.perf_buf_lost_b_count++; } static void reader_raw_cb(void *cookie, void *raw, int raw_size) { - if (unlikely(profiler_stop == 1)) + if (unlikely(oncpu_ctx.profiler_stop == 1)) return; struct reader_forward_info *fwd_info = cookie; if (unlikely(fwd_info->queue_id != 0)) { - ebpf_warning("cookie(%d) error", (u64)cookie); + ebpf_warning("cookie(%d) error", (u64) cookie); return; } @@ -322,7 +119,7 @@ static void reader_raw_cb(void *cookie, void *raw, int raw_size) v = (struct stack_trace_key_t *)raw; int ret = VEC_OK; - vec_add1(raw_stack_data, *v, ret); + vec_add1(oncpu_ctx.raw_stack_data, *v, ret); if (ret != VEC_OK) { ebpf_warning("vec add failed\n"); } @@ -340,7 +137,7 @@ static int release_profiler(struct bpf_tracer *tracer) /* free all readers */ free_all_readers(tracer); - print_cp_tracer_status(tracer); + print_cp_tracer_status(tracer, &oncpu_ctx); /* release object */ release_object(tracer->obj); @@ -350,16 +147,6 @@ static int release_profiler(struct bpf_tracer *tracer) return ETR_OK; } -static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, - const char *name) -{ - memset(h, 0, sizeof(*h)); - u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; - u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; - return stack_trace_msg_hash_init(h, (char *)name, - nbuckets, hash_memory_size); -} - static inline bool is_cpdbg_timeout(void) { uint32_t passed_sec; @@ -391,12 +178,12 @@ static void print_cp_data(stack_trace_msg_t * msg) char buff[DEBUG_BUFF_SIZE]; snprintf(buff, sizeof(buff), - "%s [cpdbg] netns_id %lu container_id %s pid %u tid %u " + "%s [cpdbg] type %d netns_id %lu container_id %s pid %u tid %u " "process_name %s comm %s stime %lu u_stack_id %u k_statck_id" " %u cpu %u count %u tiemstamp %lu datalen %u data %s\n", - timestamp, msg->netns_id, cid, msg->pid, msg->tid, - msg->process_name, msg->comm, msg->stime, - msg->u_stack_id, + timestamp, msg->profiler_type, msg->netns_id, + cid, msg->pid, msg->tid, msg->process_name, msg->comm, + msg->stime, msg->u_stack_id, msg->k_stack_id, msg->cpu, msg->count, msg->time_stamp, msg->data_len, msg->data); @@ -410,7 +197,7 @@ static void print_cp_data(stack_trace_msg_t * msg) } } -static void cpdbg_process(stack_trace_msg_t * msg) +void cpdbg_process(stack_trace_msg_t * msg) { pthread_mutex_lock(&cpdbg_mutex); if (unlikely(cpdbg_enable)) { @@ -421,631 +208,20 @@ static void cpdbg_process(stack_trace_msg_t * msg) pthread_mutex_unlock(&cpdbg_mutex); } -static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *ctx) -{ - stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv; - if (msg_kv->msg_ptr != 0) { - stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr; - - /* continuous profiler debug */ - cpdbg_process(msg); - - tracer_callback_t fun = profiler_tracer->process_fn; - /* - * Execute callback function to hand over the data to the - * higher level for processing. The higher level will se- - * nd the data to the server for storage as required. 
- */ - if (likely(profiler_stop == 0)) - fun(msg); - - clib_mem_free((void *)msg); - msg_kv->msg_ptr = 0; - } - - int ret = VEC_OK; - vec_add1(trace_msg_kvps, *kv, ret); - if (ret != VEC_OK) { - ebpf_warning("vec add failed\n"); - msg_clear_hash = true; - } - - return BIHASH_WALK_CONTINUE; -} - -/* - * Push the data and release the resources. - * @is_force: Do you need to perform a forced release? - */ -static void push_and_release_stack_trace_msg(stack_trace_msg_hash_t * h, - bool is_force) -{ - ASSERT(profiler_tracer != NULL); - - u64 curr_time, elapsed; - curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN); - elapsed = curr_time - last_push_time; - /* - * If the aggregated stack trace data obtained by the profiler - * satisfies one of the following conditions, it should be pushed - * to the upper-level processing: - * - * If the time interval since the last push exceeds or equals - * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL). - * - * Otherwise, it should return directly. - */ - if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) - return; - - /* update last push time. */ - last_push_time = curr_time; - push_count++; - - stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, - NULL); - /* - * In this iteration, all elements will be cleared, and in the - * next iteration, this hash will be reused. - */ - stack_trace_msg_hash_kv *v; - vec_foreach(v, trace_msg_kvps) { - if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { - ebpf_warning - ("stack_trace_msg_hash_add_del() failed.\n"); - msg_clear_hash = true; - } - } - - vec_free(trace_msg_kvps); - - h->hit_hash_count = 0; - h->hash_elems_count = 0; - - if (msg_clear_hash) { - msg_clear_hash = false; - stack_trace_msg_hash_free(h); - } -} - -static inline void add_stack_id_to_bitmap(int stack_id, bool is_a) -{ - if (stack_id < 0) - return; - - struct stack_ids_bitmap *ids; - if (is_a) - ids = &stack_ids_a; - else - ids = &stack_ids_b; - - if (!is_set_bitmap(ids->bitmap, stack_id)) { - set_bitmap(ids->bitmap, stack_id); - int ret = VEC_OK; - - if (is_a) - vec_add1(clear_stack_ids_a, stack_id, ret); - else - vec_add1(clear_stack_ids_b, stack_id, ret); - - if (ret != VEC_OK) { - ebpf_warning("vec add failed\n"); - } - - ids->count++; - } -} - -static inline void update_matched_process_in_total(stack_trace_msg_hash_t * - msg_hash, - char *process_name, - struct stack_trace_key_t *v) -{ - stack_trace_msg_kv_t kv; - set_msg_kvp_by_comm(&kv, v, (void *)0); - - if (stack_trace_msg_hash_search - (msg_hash, (stack_trace_msg_hash_kv *) & kv, - (stack_trace_msg_hash_kv *) & kv) == 0) { - __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); - ((stack_trace_msg_t *) kv.msg_ptr)->count++; - return; - } - - /* append ';' '\0' and '[p/t]' */ - char trace_str[(TASK_COMM_LEN * 2) + 10]; - bool is_thread = (v->pid != v->tgid); - if (is_thread) - snprintf(trace_str, sizeof(trace_str), "[p] %s;[t] %s", - process_name, v->comm); - else - snprintf(trace_str, sizeof(trace_str), "[p] %s", process_name); - - /* append 2 byte for ';''\0' */ - int len = sizeof(stack_trace_msg_t) + strlen(trace_str) + 2; - stack_trace_msg_t *msg = alloc_stack_trace_msg(len); - if (msg == NULL) { - clib_mem_free(trace_str); - return; - } - - set_stack_trace_msg(msg, v, false, 0, 0, process_name, NULL); - snprintf((char *)&msg->data[0], strlen(trace_str) + 2, "%s", trace_str); - msg->data_len = strlen((char *)msg->data); - kv.msg_ptr = pointer_to_uword(msg); - - if (stack_trace_msg_hash_add_del(msg_hash, - (stack_trace_msg_hash_kv - *) & kv, 1 /* 
is_add */ )) { - ebpf_warning("stack_trace_msg_hash_add_del() failed.\n"); - clib_mem_free(msg); - } else { - __sync_fetch_and_add(&msg_hash->hash_elems_count, 1); - } -} - -static void aggregate_stack_traces(struct bpf_tracer *t, - const char *stack_map_name, - stack_str_hash_t * stack_str_hash, - stack_trace_msg_hash_t * msg_hash, - u32 * count, bool use_a_map) -{ - struct stack_trace_key_t *v; - vec_foreach(v, raw_stack_data) { - if (v == NULL) - break; - - if (unlikely(profiler_stop == 1)) - break; - - /* - * If cpu_aggregation_flag=0, the CPU value for stack trace data - * reporting is a special value (CPU_INVALID:0xfff) used to indicate - * that it is an invalid value, the CPUID will not be included in - * the aggregation. - */ - if (cpu_aggregation_flag == 0) - v->cpu = CPU_INVALID; - - /* - * Uniform idle process names to reduce the aggregated count of stack - * trace data (when we aggregate using process names as part of the key). - * "swapper/0", "swapper/1", "swapper/2" ... > "swapper" - */ - if (v->pid == v->tgid && v->pid == 0) { - const char *idle_name = "swapper"; - strcpy_s_inline(v->comm, sizeof(v->comm), - idle_name, strlen(idle_name)); - } - - /* -EEXIST: Hash bucket collision in the stack trace table */ - if (v->kernstack == -EEXIST) - stack_trace_lost++; - - if (v->userstack == -EEXIST) - stack_trace_lost++; - - add_stack_id_to_bitmap(v->kernstack, use_a_map); - add_stack_id_to_bitmap(v->userstack, use_a_map); - - /* Total iteration count for this iteration. */ - (*count)++; - - /* Total iteration count for all iterations. */ - process_count++; - - /* - * Firstly, search the stack-trace-msg hash to see if the - * stack trace messages has already been stored. - */ - stack_trace_msg_kv_t kv; - char name[TASK_COMM_LEN]; - memset(name, 0, sizeof(name)); - u64 stime, netns_id; - stime = netns_id = 0; - void *info_p = NULL; - char *process_name = NULL; - bool matched, is_match_finish; - matched = is_match_finish = false; - - /* If it is a process, match operation will be performed immediately. */ - if (v->pid == v->tgid) { - is_match_finish = true; - matched = (regexec(&profiler_regex, v->comm, 0, NULL, 0) - == 0); - if (!matched) { - set_msg_kvp_by_comm(&kv, v, (void *)0); - goto skip_proc_find; - } - } - - get_process_info_by_pid(v->tgid, &stime, &netns_id, - (char *)name, &info_p); - - /* - * If the data collected is from a running process, and the process - * name and the command name of the task (captured by eBPF) are not - * consistent, it indicates that the cached process information is - * no longer valid. - */ - if (stime > 0 && v->pid == v->tgid && strcmp(name, v->comm)) { - stime = netns_id = 0; - name[0] = '\0'; - process_name = NULL; - info_p = NULL; - } - - if (stime > 0) { - if (v->tgid == 0) - process_name = v->comm; - else - process_name = name; - - if (!is_match_finish) - matched = - (regexec - (&profiler_regex, process_name, 0, NULL, 0) - == 0); - if (matched) - set_msg_kvp(&kv, v, stime, (void *)0); - else - set_msg_kvp_by_comm(&kv, v, (void *)0); - } else { - /* Not find process in procfs. */ - set_msg_kvp_by_comm(&kv, v, (void *)0); - } - - /* - * Here, we duplicate the matched process data and place it into - * the Total process, with the aim of showcasing the proportion - * of each process in the overall sampling. 
- */ - if (matched) - update_matched_process_in_total(msg_hash, process_name, - v); - - skip_proc_find: - if (stack_trace_msg_hash_search - (msg_hash, (stack_trace_msg_hash_kv *) & kv, - (stack_trace_msg_hash_kv *) & kv) == 0) { - __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); - ((stack_trace_msg_t *) kv.msg_ptr)->count++; - continue; - } - - /* - * Folded stack trace string and generate stack trace messages. - * - * Folded stack trace string (taken from a performance profiler test): - * main;xxx();yyy() - * It is a list of symbols corresponding to addresses in the underlying - * stack trace, separated by ';'. - */ - - char *trace_str = - resolve_and_gen_stack_trace_str(t, v, stack_map_name, - stack_str_hash, matched, - process_name, info_p); - if (trace_str) { - /* - * append process/thread name to stack string - * append 2 byte for ';''\0' - * append pre_tag '[p/t]' - */ - char pre_tag[5]; - int str_len = strlen(trace_str) + 2; - if (matched) - str_len += strlen(v->comm) + sizeof(pre_tag); - - int len = sizeof(stack_trace_msg_t) + str_len; - stack_trace_msg_t *msg = alloc_stack_trace_msg(len); - if (msg == NULL) { - clib_mem_free(trace_str); - continue; - } - - memset(msg, 0, len); - struct symbolizer_proc_info *__p = info_p; - set_stack_trace_msg(msg, v, matched, stime, netns_id, - process_name, - __p ? __p->container_id : NULL); - - snprintf(pre_tag, sizeof(pre_tag), "%s ", - v->pid == v->tgid ? "[p]" : "[t]"); - if (matched) - snprintf((char *)&msg->data[0], str_len, - "%s%s;%s", pre_tag, v->comm, - trace_str); - else - snprintf((char *)&msg->data[0], str_len, "%s", - trace_str); - - msg->data_len = strlen((char *)msg->data); - clib_mem_free(trace_str); - kv.msg_ptr = pointer_to_uword(msg); - - if (stack_trace_msg_hash_add_del(msg_hash, - (stack_trace_msg_hash_kv - *) & kv, - 1 /* is_add */ )) { - ebpf_warning - ("stack_trace_msg_hash_add_del() failed.\n"); - clib_mem_free(msg); - } else { - __sync_fetch_and_add - (&msg_hash->hash_elems_count, 1); - } - } - - /* check and clean symbol cache */ - exec_proc_info_cache_update(); - } - - vec_free(raw_stack_data); -} - -void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag) -{ - if (bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - ENABLE_IDX, &enable_flag) == false) { - ebpf_warning("profiler state map update error." 
- "(%s enable_flag %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - enable_flag, strerror(errno)); - } - - g_enable_perf_sample = enable_flag; - - ebpf_info("%s() success, enable_flag:%d\n", __func__, enable_flag); -} - -static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, - const char *stack_map_name) -{ - struct ebpf_map *map = - ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); - if (map == NULL) { - ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, - stack_map_name); - return 0; - } - int map_fd = map->fd; - - u32 key = 0, next_key; - u32 reclaim_count = 0; - u32 find_count = 0; - struct list_head clear_elem_head; - init_list_head(&clear_elem_head); - - while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { - find_count++; - insert_list(&next_key, sizeof(next_key), &clear_elem_head); - key = next_key; - } - - reclaim_count = __reclaim_map(map_fd, &clear_elem_head); - - ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", - __func__, stack_map_name, find_count, reclaim_count); - - return reclaim_count; -} - -static void cleanup_stackmap(struct bpf_tracer *t, - const char *stack_map_name, bool is_a) -{ - struct stack_ids_bitmap *ids; - int *clear_stack_ids; - u64 *perf_buf_lost_p = NULL; - - if (is_a) { - ids = &stack_ids_a; - clear_stack_ids = clear_stack_ids_a; - perf_buf_lost_p = &perf_buf_lost_a_count; - } else { - ids = &stack_ids_b; - clear_stack_ids = clear_stack_ids_b; - perf_buf_lost_p = &perf_buf_lost_b_count; - } - - if (ids->count != vec_len(clear_stack_ids)) { - ebpf_warning - ("stack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", - ids->count, vec_len(clear_stack_ids)); - } - - /* - * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, - * which may lead to out-of-order behavior in a multi-core environment. - * We have employed a threshold to delay the cleanup of the stack map, reducing the - * occurrence of premature clearing of stack entries caused by the disorder in stack - * data. - * - * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in - * 'agent/src/ebpf/user/config.h'. - */ - if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) { - int *sid; - vec_foreach(sid, clear_stack_ids) { - int id = *sid; - if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) { - /* - * It may be due to the disorder in the perf buffer transmission, - * leading to the repetitive deletion of the same stack ID. - */ - stackmap_clear_failed_count++; - } - - clear_bitmap(ids->bitmap, id); - } - - if (is_a) - vec_free(clear_stack_ids_a); - else - vec_free(clear_stack_ids_b); - - ids->count = 0; - - /* - * If data loss occurs due to the user-space receiver program - * being too busy and not promptly fetching data from the perf - * buffer, it is necessary to clean the stack map once to prevent - * excessive remnants of stack data from affecting the acquisition - * of new stack data (i.e., eBPF using the bpf_get_stackid() - * interface will return -EEXIST). - */ - if (*perf_buf_lost_p > 0) { - delete_all_stackmap_elems(t, stack_map_name); - *perf_buf_lost_p = 0; - } - } -} - -static void process_bpf_stacktraces(struct bpf_tracer *t, - struct bpf_perf_reader *r_a, - struct bpf_perf_reader *r_b) -{ - struct bpf_perf_reader *r; - const char *stack_map_name; - bool using_map_set_a = (transfer_count % 2 == 0); - r = using_map_set_a ? r_a : r_b; - stack_map_name = using_map_set_a ? MAP_STACK_A_NAME : MAP_STACK_B_NAME; - const u64 sample_count_idx = - using_map_set_a ? 
SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; - - struct epoll_event events[r->readers_count]; - int nfds = reader_epoll_wait(r, events, 0); - - transfer_count++; - /* update map MAP_PROFILER_STATE_MAP */ - if (bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - TRANSFER_CNT_IDX, &transfer_count) == false) { - ebpf_warning("profiler state map update error." - "(%s transfer_count %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - transfer_count, strerror(errno)); - transfer_count--; - } - - /* Total iteration count for this iteration. */ - u32 count = 0; - - /* eBPF map record count for this iteration. */ - u64 sample_cnt_val = 0; - - /* - * Why use g_stack_str_hash? - * - * When the stringizer encounters a stack-ID for the first time in - * the stack trace table, it clears it. If a stack-ID is reused by - * different stack trace keys, the stringizer returns its memoized - * stack trace string. Since stack IDs are unstable between profile - * iterations, we create and destroy the stringizer in each profile - * iteration. - */ - if (unlikely(g_stack_str_hash.buckets == NULL)) { - if (init_stack_str_hash(&g_stack_str_hash, "profile_stack_str")) { - ebpf_warning("init_stack_str_hash() failed.\n"); - return; - } - } - - /* - * During each transmission iteration, we have a hashmap structure in - * place for the following purposes: - * - * 1 Pushing the data of this iteration to the higher-level processing. - * 2 Performing data statistics based on the stack trace data, using the - * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + - * u_stack_id + " as the key. - * - * Here is the key-value pair structure of the hashmap: - * see perf_profiler.h (stack_trace_msg_kv_t) - * This is the final form of the data. If the current stack trace message - * is a match, we only need to increment the count field in the correspon- - * ding value, thus avoiding duplicate parsing. - */ - if (unlikely(g_msg_hash.buckets == NULL)) { - if (init_stack_trace_msg_hash(&g_msg_hash, "stack_trace_msg")) { - ebpf_warning("init_stack_trace_msg_hash() failed.\n"); - return; - } - } - - if (nfds > 0) { - - check_again: - if (unlikely(profiler_stop == 1)) - goto release_iter; - - /* - * If there is data, the reader's callback - * function will be called. - */ - reader_event_read(events, nfds); - - /* - * After the reader completes data reading, the work of - * data aggregation will be blocked if there is no data. - */ - aggregate_stack_traces(t, stack_map_name, &g_stack_str_hash, - &g_msg_hash, &count, using_map_set_a); - - /* - * To ensure that all data in the perf ring-buffer is procenssed - * in this iteration, as this iteration will clean up all the - * data recorded in the stackmap, any residual data in the perf - * ring-buffer will be carried over to the next iteration for - * processing. This poses a risk of not being able to find the - * corresponding stackmap records in the next iteration, leading - * to incomplete processing. - */ - if (bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, - (void *)&sample_cnt_val)) { - if (sample_cnt_val > count) { - nfds = reader_epoll_short_wait(r, events, 0); - if (nfds > 0) - goto check_again; - } - } - } - -release_iter: - - cleanup_stackmap(t, stack_map_name, using_map_set_a); - - /* Now that we've consumed the data, reset the sample count in BPF. 
*/ - sample_cnt_val = 0; - bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, &sample_cnt_val); - - print_profiler_status(t, count, &g_stack_str_hash, &g_msg_hash); - - /* free all elems */ - clean_stack_strs(&g_stack_str_hash); - - /* Push messages and free stack_trace_msg_hash */ - push_and_release_stack_trace_msg(&g_msg_hash, false); -} - static void java_syms_update_work(void *arg) { java_syms_update_main(arg); } -static void cp_reader_work(void *arg) +static void oncpu_reader_work(void *arg) { thread_index = THREAD_PROFILER_READER_IDX; struct bpf_tracer *t = profiler_tracer; - struct bpf_perf_reader *reader_a, *reader_b; - reader_a = &t->readers[0]; - reader_b = &t->readers[1]; for (;;) { - if (unlikely(profiler_stop == 1)) { - if (g_enable_perf_sample) - set_enable_perf_sample(t, 0); + if (unlikely(oncpu_ctx.profiler_stop == 1)) { + if (oncpu_ctx.enable_bpf_profile) + set_bpf_run_enabled(t, &oncpu_ctx, 0); goto exit; } @@ -1060,45 +236,44 @@ static void cp_reader_work(void *arg) * exit events. We want to ensure that everything is ready * before the profiler performs address translation. */ - if (unlikely(!regex_existed || + if (unlikely(!oncpu_ctx.regex_existed || get_socket_tracer_state() != TRACER_RUNNING)) { - if (g_enable_perf_sample) - set_enable_perf_sample(t, 0); + if (oncpu_ctx.enable_bpf_profile) + set_bpf_run_enabled(t, &oncpu_ctx, 0); exec_proc_info_cache_update(); sleep(1); continue; } - if (unlikely(!g_enable_perf_sample)) - set_enable_perf_sample(t, 1); + if (unlikely(!oncpu_ctx.enable_bpf_profile)) + set_bpf_run_enabled(t, &oncpu_ctx, 1); - tracer_reader_lock(t); - process_bpf_stacktraces(t, reader_a, reader_b); - tracer_reader_unlock(t); + process_bpf_stacktraces(&oncpu_ctx, t); } exit: - print_cp_tracer_status(t); + print_cp_tracer_status(t, &oncpu_ctx); - print_hash_stack_str(&g_stack_str_hash); + print_hash_stack_str(&oncpu_ctx.stack_str_hash); /* free stack_str_hash */ - if (likely(g_stack_str_hash.buckets != NULL)) { - release_stack_str_hash(&g_stack_str_hash); + if (likely(oncpu_ctx.stack_str_hash.buckets != NULL)) { + release_stack_str_hash(&oncpu_ctx.stack_str_hash); } - print_hash_stack_trace_msg(&g_msg_hash); + print_hash_stack_trace_msg(&oncpu_ctx.msg_hash); /* free stack_str_hash */ - if (likely(g_msg_hash.buckets != NULL)) { + if (likely(oncpu_ctx.msg_hash.buckets != NULL)) { /* Ensure that all elements are released properly/cleanly */ - push_and_release_stack_trace_msg(&g_msg_hash, true); - stack_trace_msg_hash_free(&g_msg_hash); + push_and_release_stack_trace_msg(&oncpu_ctx, + &oncpu_ctx.msg_hash, true); + stack_trace_msg_hash_free(&oncpu_ctx.msg_hash); } /* resouce share release */ release_symbol_caches(); /* clear thread */ - t->perf_worker[0] = 0; + t->perf_workers[THREAD_PROFILER_READER_IDX] = 0; ebpf_info(LOG_CP_TAG "perf profiler reader-thread exit.\n"); pthread_exit(NULL); @@ -1114,42 +289,10 @@ static int create_profiler(struct bpf_tracer *tracer) if (tracer_bpf_load(tracer)) return ETR_LOAD; - set_enable_perf_sample(tracer, 0); - - /* - * create reader for read eBPF-profiler data. - * To implement eBPF perf-profiler double buffering output, - * it is necessary to create two readers to correspond to - * the double buffering structure design. 
- */ - struct bpf_perf_reader *reader_a, *reader_b; - reader_a = create_perf_buffer_reader(tracer, - MAP_PERF_PROFILER_BUF_A_NAME, - reader_raw_cb, - reader_lost_cb_a, - PROFILE_PG_CNT_DEF, 1, - PROFILER_READER_EPOLL_TIMEOUT); - if (reader_a == NULL) - return ETR_NORESOURCE; - - reader_b = create_perf_buffer_reader(tracer, - MAP_PERF_PROFILER_BUF_B_NAME, - reader_raw_cb, - reader_lost_cb_b, - PROFILE_PG_CNT_DEF, 1, - PROFILER_READER_EPOLL_TIMEOUT); - if (reader_b == NULL) { - free_perf_buffer_reader(reader_a); - return ETR_NORESOURCE; - } - /* clear old perf files */ exec_command("/usr/bin/rm -rf /tmp/perf-*.map", ""); exec_command("/usr/bin/rm -rf /tmp/perf-*.log", ""); - /* attach perf event */ - tracer_hooks_attach(tracer); - ret = create_work_thread("java_update", &java_syms_update_thread, (void *)java_syms_update_work, (void *)tracer); @@ -1158,17 +301,67 @@ static int create_profiler(struct bpf_tracer *tracer) goto error; } - /* - * Start a new thread to execute the data - * reading of perf buffer. - */ - ret = enable_tracer_reader_work("cp_reader", 0, tracer, - (void *)&cp_reader_work); + if (g_enable_oncpu) { + ebpf_info(LOG_CP_TAG "=== oncpu profiler enabled ===\n"); + tracer->enable_sample = true; + set_bpf_run_enabled(tracer, &oncpu_ctx, 0); - if (ret) { - goto error; + /* + * create reader for read eBPF-profiler data. + * To implement eBPF perf-profiler double buffering output, + * it is necessary to create two readers to correspond to + * the double buffering structure design. + */ + struct bpf_perf_reader *reader_a, *reader_b; + reader_a = create_perf_buffer_reader(tracer, + MAP_PERF_PROFILER_BUF_A_NAME, + reader_raw_cb, + reader_lost_cb_a, + PROFILE_PG_CNT_DEF, 1, + PROFILER_READER_EPOLL_TIMEOUT); + if (reader_a == NULL) + return ETR_NORESOURCE; + + reader_b = create_perf_buffer_reader(tracer, + MAP_PERF_PROFILER_BUF_B_NAME, + reader_raw_cb, + reader_lost_cb_b, + PROFILE_PG_CNT_DEF, 1, + PROFILER_READER_EPOLL_TIMEOUT); + if (reader_b == NULL) { + free_perf_buffer_reader(reader_a); + return ETR_NORESOURCE; + } + + oncpu_ctx.r_a = reader_a; + oncpu_ctx.r_b = reader_b; + + /* + * Start a new thread to execute the data + * reading of perf buffer. + */ + ret = + enable_tracer_reader_work("oncpu_reader", + THREAD_PROFILER_READER_IDX, + tracer, + (void *)&oncpu_reader_work); + + if (ret) { + goto error; + } + } else { + tracer->enable_sample = false; + ebpf_info(LOG_CP_TAG "=== oncpu profiler disabled ===\n"); } + if (tracer_probes_init(tracer)) + return (-1); + + extended_reader_create(tracer); + + /* attach perf event */ + tracer_hooks_attach(tracer); + return ETR_OK; error: @@ -1176,9 +369,28 @@ static int create_profiler(struct bpf_tracer *tracer) return ETR_INVAL; } +static inline bool all_perf_workers_exit(struct bpf_tracer *t) +{ + int i; + int count = ARRAY_SIZE(t->perf_workers); + for (i = 0; i < count; i++) { + if (t->perf_workers[i]) + return false; + } + + return true; +} + int stop_continuous_profiler(void) { - profiler_stop = 1; + oncpu_ctx.profiler_stop = 1; + if (profiler_tracer == NULL) + return (0); + + // Wait for all reader threads to exit. 
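+	// Each reader thread clears its slot in t->perf_workers[] before it
+	// calls pthread_exit() (see oncpu_reader_work()); this loop returns
+	// only after every slot has been cleared.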
+ while (!all_perf_workers_exit(profiler_tracer)) + sleep(1); + if (flame_graph_end_time == NULL) { flame_graph_end_time = gen_file_name_by_datetime(); } @@ -1188,9 +400,9 @@ int stop_continuous_profiler(void) u64 alloc_b, free_b; get_mem_stat(&alloc_b, &free_b); - if (regex_existed) { - regfree(&profiler_regex); - regex_existed = false; + if (oncpu_ctx.regex_existed) { + regfree(&oncpu_ctx.profiler_regex); + oncpu_ctx.regex_existed = false; } ebpf_info(LOG_CP_TAG "== alloc_b %lu bytes, free_b %lu bytes, " @@ -1198,37 +410,38 @@ int stop_continuous_profiler(void) return (0); } -static void print_cp_tracer_status(struct bpf_tracer *t) +static void print_cp_tracer_status(struct bpf_tracer *t, + struct profiler_context *ctx) { u64 alloc_b, free_b; get_mem_stat(&alloc_b, &free_b); u64 sample_drop_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, SAMPLE_CNT_DROP, + if (!bpf_table_get_value(t, ctx->state_map_name, SAMPLE_CNT_DROP, (void *)&sample_drop_cnt)) { ebpf_warning("Get map '%s' sample_drop_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 output_err_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, ERROR_IDX, + if (!bpf_table_get_value(t, ctx->state_map_name, ERROR_IDX, (void *)&output_err_cnt)) { ebpf_warning("Get map '%s' output_err_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 output_count = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, OUTPUT_CNT_IDX, + if (!bpf_table_get_value(t, ctx->state_map_name, OUTPUT_CNT_IDX, (void *)&output_count)) { ebpf_warning("Get map '%s' output_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } u64 iter_max_cnt = 0; - if (!bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, SAMPLE_ITER_CNT_MAX, + if (!bpf_table_get_value(t, ctx->state_map_name, SAMPLE_ITER_CNT_MAX, (void *)&iter_max_cnt)) { ebpf_warning("Get map '%s' iter_max_cnt failed.\n", - MAP_PROFILER_STATE_MAP); + ctx->state_map_name); } ebpf_info("\n\n----------------------------\nrecv envent:\t%lu\n" @@ -1236,7 +449,7 @@ static void print_cp_tracer_status(struct bpf_tracer *t) "perf_buf_lost_b:\t%lu process_lost_count:\t%lu " "stack_table_data_miss:\t%lu\n" "stackmap_clear_failed_count\t%lu\n" - "stack_trace_lost:\t%lu\ntransfer_count:\t%lu " + "stack_trace_err:\t%lu\ntransfer_count:\t%lu " "iter_count_avg:\t%.2lf\nalloc_b:\t%lu bytes " "free_b:\t%lu bytes use:\t%lu bytes\n" "eBPF map status:\n" @@ -1245,77 +458,21 @@ static void print_cp_tracer_status(struct bpf_tracer *t) " - output_err_cnt:\t%lu\n" " - iter_max_cnt:\t%lu\n" "----------------------------\n\n", - atomic64_read(&t->recv), process_count, - atomic64_read(&t->lost), perf_buf_lost_a_count, - perf_buf_lost_b_count, perf_buf_lost_a_count, - perf_buf_lost_b_count, get_process_lost_count(), + atomic64_read(&t->recv), ctx->process_count, + atomic64_read(&t->lost), ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, get_process_lost_count(ctx), get_stack_table_data_miss_count(), - stackmap_clear_failed_count, stack_trace_lost, transfer_count, - ((double)atomic64_read(&t->recv) / (double)transfer_count), - alloc_b, free_b, alloc_b - free_b, output_count, - sample_drop_cnt, output_err_cnt, iter_max_cnt); -} - -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h) -{ - u64 alloc_b, free_b; - get_mem_stat(&alloc_b, &free_b); - ebpf_debug("\n\n----------------------------\nrecv envent:\t%lu\n" - 
"kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n" - "stack_trace_lost:\t%lu\n" - "stackmap_clear_failed_count\t%lu\n" - "ransfer_count:\t%lu iter_count:\t%lu\nall" - "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n" - "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n", - atomic64_read(&t->recv), atomic64_read(&t->lost), - perf_buf_lost_a_count, perf_buf_lost_b_count, - stack_trace_lost, stackmap_clear_failed_count, - transfer_count, iter_count, - alloc_b, free_b, alloc_b - free_b, - h->hit_hash_count, msg_h->hit_hash_count); -} - -/* - * View kernel addresses exposed via /proc and other interfaces - * when /proc/sys/kernel/kptr_restrict has the value 1, it is - * necessary to set the CAP_SYSLOG capability, otherwise all k- - * ernel addresses are set to 0. - * - * This function is used to check if the kernel address is 0. - */ -static bool check_kallsyms_addr_is_zero(void) -{ - const int check_num = 100; - const int max_line_len = 256; - const char *check_str = "0000000000000000"; - - FILE *file = fopen("/proc/kallsyms", "r"); - if (file == NULL) { - ebpf_warning(LOG_CP_TAG "Error opening /proc/kallsyms"); - return false; - } - - char line[max_line_len]; - int count = 0; - - while (fgets(line, sizeof(line), file) != NULL && count < check_num) { - char address[17]; // 16 characters + null terminator - sscanf(line, "%16s", address); - - if (strcmp(address, check_str) == 0) { - count++; - } - } - - fclose(file); - - return (count == check_num); + ctx->stackmap_clear_failed_count, ctx->stack_trace_err, + ctx->transfer_count, + ((double)atomic64_read(&t->recv) / + (double)ctx->transfer_count), alloc_b, free_b, + alloc_b - free_b, output_count, sample_drop_cnt, + output_err_cnt, iter_max_cnt); } static int cpdbg_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t * outsize) + void **out, size_t *outsize) { return 0; } @@ -1369,6 +526,7 @@ static struct tracer_sockopts cpdbg_sockopts = { * @callback Profile data processing callback interface * @returns 0 on success, < 0 on error */ + int start_continuous_profiler(int freq, int java_syms_space_limit, int java_syms_update_delay, tracer_callback_t callback) @@ -1377,26 +535,13 @@ int start_continuous_profiler(int freq, int java_syms_space_limit, void *bpf_bin_buffer; uword buffer_sz; - // REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). 
- if (check_kernel_version(4, 9) != 0) { - ebpf_warning - (LOG_CP_TAG - "Currnet linux %d.%d, not support, require Linux 4.9+\n", - major, minor); - + if (!run_conditions_check()) return (-1); - } - if (check_kallsyms_addr_is_zero()) { - ebpf_warning(LOG_CP_TAG - "All kernel addresses in /proc/kallsyms are 0, Please" - " follow the steps below to resolve:\n" - "1 Make sure the content of the '/proc/sys/kernel/kpt" - "r_restrict' file is not 2, if it is 2 please set it " - "to 1.\n2 Add 'CAP_SYSLOG' permission to the containe" - "r.\n3 Restart the pod."); - return (-1); - } + profiler_context_init(&oncpu_ctx, LOG_CP_TAG, + PROFILER_TYPE_ONCPU, g_enable_oncpu, + MAP_PROFILER_STATE_NAME, MAP_STACK_A_NAME, + MAP_STACK_B_NAME, false, false); int java_space_bytes = java_syms_space_limit * 1024 * 1024; if ((java_space_bytes < JAVA_POD_WRITE_FILES_SPACE_MIN) || @@ -1413,86 +558,35 @@ int start_continuous_profiler(int freq, int java_syms_space_limit, set_java_syms_fetch_delay(java_syms_update_delay); ebpf_info("set java_syms_update_delay : %lu\n", java_syms_update_delay); - atomic64_init(&process_lost_count); - /* * Initialize cpdbg */ pthread_mutex_init(&cpdbg_mutex, NULL); - profiler_stop = 0; - start_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_SEC); - // CPUID will not be included in the aggregation of stack trace data. set_profiler_cpu_aggregation(0); - // Java agent so library generation. - if (access(AGENT_LIB_SRC_PATH, F_OK) == 0) { - if (unlink(AGENT_LIB_SRC_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - AGENT_LIB_SRC_PATH); - return (-1); - } - } - - if (access(AGENT_MUSL_LIB_SRC_PATH, F_OK) == 0) { - if (unlink(AGENT_MUSL_LIB_SRC_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - AGENT_MUSL_LIB_SRC_PATH); - return (-1); - } - } - - if (gen_file_from_mem((const char *)java_agent_so_gnu, - sizeof(java_agent_so_gnu), - (const char *)AGENT_LIB_SRC_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java agent so library(%s) generate failed.\n", - AGENT_LIB_SRC_PATH); - return (-1); - } - - if (gen_file_from_mem((const char *)java_agent_so_musl, - sizeof(java_agent_so_musl), - (const char *)AGENT_MUSL_LIB_SRC_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java agent so library(%s) generate failed.\n", - AGENT_MUSL_LIB_SRC_PATH); - return (-1); - } - - /* For java attach tool */ - if (access(JAVA_ATTACH_TOOL_PATH, F_OK) == 0) { - if (unlink(JAVA_ATTACH_TOOL_PATH) != 0) { - ebpf_warning(LOG_CP_TAG "rm file %s failed.\n", - JAVA_ATTACH_TOOL_PATH); - return (-1); - } - } - - if (gen_file_from_mem((const char *)deepflow_jattach_bin, - sizeof(deepflow_jattach_bin), - (const char *)JAVA_ATTACH_TOOL_PATH)) { - ebpf_warning(LOG_CP_TAG - "Java attach tool (%s) generate failed.\n", - JAVA_ATTACH_TOOL_PATH); - return (-1); - } - - if (chmod(JAVA_ATTACH_TOOL_PATH, 0755) < 0) { - ebpf_warning(LOG_CP_TAG - "file '%s' chmod failed.\n", - JAVA_ATTACH_TOOL_PATH); + // Java agent so library generation and tools install. 
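+	// Expected to write df_java_agent.so, df_java_agent_musl.so and the
+	// Java attach tool to /tmp, returning a non-zero value on failure.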
+ if (java_libs_and_tools_install() != 0) return (-1); - } snprintf(bpf_load_buffer_name, NAME_LEN, "continuous_profiler"); bpf_bin_buffer = (void *)perf_profiler_common_ebpf_data; buffer_sz = sizeof(perf_profiler_common_ebpf_data); + struct tracer_probes_conf *tps = + malloc(sizeof(struct tracer_probes_conf)); + if (tps == NULL) { + ebpf_warning("malloc() error.\n"); + return -ENOMEM; + } + memset(tps, 0, sizeof(*tps)); + init_list_head(&tps->uprobe_syms_head); + CP_PROFILE_SET_PROBES(tps); + struct bpf_tracer *tracer = setup_bpf_tracer(CP_TRACER_NAME, bpf_load_buffer_name, - bpf_bin_buffer, buffer_sz, NULL, 0, + bpf_bin_buffer, buffer_sz, tps, 0, release_profiler, create_profiler, (void *)callback, freq); if (tracer == NULL) @@ -1554,7 +648,7 @@ void release_flame_graph_hash(void) "<<< stack_count %lu add_count %lu hit_count %lu msg_ptr_zero" "_count %lu push_count %lu >>>\n", stack_count, test_add_count, test_hit_count, msg_ptr_zero_count, - push_count); + oncpu_ctx.push_count); ebpf_info(LOG_CP_TAG "Please use the following command to generate a flame graph:" @@ -1578,42 +672,21 @@ int set_profiler_regex(const char *pattern) return (-1); } + if (!g_enable_oncpu) { + ebpf_warning(LOG_CP_TAG + "'profiler_regex' cannot be set while on-CPU is currently disabled.\n"); + return (-1); + } + /* * During the data processing, the thread responsible for matching reads the * regular expression, while the thread handling the regular expression upd- * ates is different. Synchronization is implemented to ensure protection and * coordination between these two threads. */ - tracer_reader_lock(profiler_tracer); - if (*pattern == '\0') { - regex_existed = false; - ebpf_warning(LOG_CP_TAG - "Set 'profiler_regex' pattern : '', an empty" - " regular expression will not generate any stack data." 
- "Please configure the regular expression for profiler.\n"); - tracer_reader_unlock(profiler_tracer); - return (0); - } - - if (regex_existed) { - regfree(&profiler_regex); - } - - int ret = regcomp(&profiler_regex, pattern, REG_EXTENDED); - if (ret != 0) { - char error_buffer[100]; - regerror(ret, &profiler_regex, error_buffer, - sizeof(error_buffer)); - ebpf_warning(LOG_CP_TAG - "Pattern %s failed to compile the regular " - "expression: %s\n", pattern, error_buffer); - regex_existed = false; - tracer_reader_unlock(profiler_tracer); - return (-1); - } - - regex_existed = true; - tracer_reader_unlock(profiler_tracer); + profile_regex_lock(&oncpu_ctx); + do_profiler_regex_config(pattern, &oncpu_ctx); + profile_regex_unlock(&oncpu_ctx); ebpf_info(LOG_CP_TAG "Set 'profiler_regex' successful, pattern : '%s'", pattern); return (0); @@ -1627,7 +700,7 @@ int set_profiler_cpu_aggregation(int flag) return (-1); } - cpu_aggregation_flag = (u64) flag; + oncpu_ctx.cpu_aggregation_flag = (u64) flag; ebpf_info(LOG_CP_TAG "Set 'cpu_aggregation_flag' successful, value %d\n", flag); @@ -1676,6 +749,20 @@ int cpdbg_set_config(int timeout, debug_callback_t cb) return 0; } +int enable_oncpu_profiler(void) +{ + g_enable_oncpu = true; + ebpf_info(LOG_CP_TAG "Set oncpu profiler enable.\n"); + return 0; +} + +int disable_oncpu_profiler(void) +{ + g_enable_oncpu = false; + ebpf_info(LOG_CP_TAG "Set oncpu profiler distable.\n"); + return 0; +} + #else /* defined AARCH64_MUSL */ #include "../tracer.h" #include "perf_profiler.h" @@ -1718,7 +805,8 @@ struct bpf_tracer *get_profiler_tracer(void) return NULL; } -void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag) +void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx, + u64 enable_flag) { } @@ -1726,4 +814,14 @@ int cpdbg_set_config(int timeout, debug_callback_t cb) { } +int enable_oncpu_profiler(void) +{ + return 0; +} + +int disable_oncpu_profiler(void) +{ + return 0; +} + #endif /* AARCH64_MUSL */ diff --git a/agent/src/ebpf/user/profile/perf_profiler.h b/agent/src/ebpf/user/profile/perf_profiler.h index 76551a57ec2..75176329ad1 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.h +++ b/agent/src/ebpf/user/profile/perf_profiler.h @@ -16,6 +16,8 @@ #ifndef DF_USER_PERF_PROFILER_H #define DF_USER_PERF_PROFILER_H +#define CP_PROFILE_SET_PROBES(T) +#include "extended/extended.h" #include "../bihash_24_8.h" #include "../../kernel/include/perf_profiler.h" @@ -49,22 +51,20 @@ typedef struct { union { struct { /* - * tgid:(max 67,108,864) - * The tgid (Thread Group ID) in kernel space - * is equivalent to the process ID in user space. - * pid:(max 67,108,864) - * The process ID or thread ID in kernel space. - * cpu: (max 4,096) - * Which CPU core does the perf event occur on? - */ - u64 tgid: 26, - pid: 26, - cpu: 12; + * tgid:(max 67,108,864) + * The tgid (Thread Group ID) in kernel space + * is equivalent to the process ID in user space. + * pid:(max 67,108,864) + * The process ID or thread ID in kernel space. + * cpu: (max 4,096) + * Which CPU core does the perf event occur on? + */ + u64 tgid:26, pid:26, cpu:12; /* * process start time(the number of millisecond * elapsed since January 1, 1970 00:00:00). - */ + */ u64 stime; u32 u_stack_id; u32 k_stack_id; @@ -73,19 +73,25 @@ typedef struct { /* Matching and combining for process/thread name. 
*/ struct { u8 comm[TASK_COMM_LEN]; - u64 pid: 26, - reserved: 26, - cpu: 12; + u64 pid:26, reserved:26, cpu:12; } c_k; }; /* Store perf profiler data */ uword msg_ptr; -} stack_trace_msg_kv_t; +} stack_trace_msg_kv_t; + +enum { + PROFILER_TYPE_UNKNOWN, + PROFILER_TYPE_ONCPU, + PROFILER_TYPE_NUM, +}; /* * stack trace message value, push data * + * @profiler_type + * Profiler type, such as PROFILER_TYPE_ONCPU. * @time_stamp * Timestamp of the stack trace data(unit: nanoseconds). * @pid @@ -106,6 +112,10 @@ typedef struct { * The profiler captures the number of occurrences of the same * data by querying with the quadruple * "" as the key. + * 1.1936h + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in Microseconds) are + * used. Range: [1, 2^32-1)us * @comm * comm in task_struct(linux kernel), always 16 bytes * If the capture is a process, fill in the process name here. @@ -125,6 +135,7 @@ typedef struct { * + ";" + */ typedef struct { + u8 profiler_type; u64 time_stamp; u32 pid; u32 tid; @@ -151,10 +162,11 @@ int stop_continuous_profiler(void); int start_continuous_profiler(int freq, int java_syms_space_limit, int java_syms_update_delay, tracer_callback_t callback); -void process_stack_trace_data_for_flame_graph(stack_trace_msg_t *val); +void process_stack_trace_data_for_flame_graph(stack_trace_msg_t * val); void release_flame_graph_hash(void); int set_profiler_regex(const char *pattern); int set_profiler_cpu_aggregation(int flag); struct bpf_tracer *get_profiler_tracer(void); void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag); +void cpdbg_process(stack_trace_msg_t * msg); #endif /* DF_USER_PERF_PROFILER_H */ diff --git a/agent/src/ebpf/user/profile/profile_common.c b/agent/src/ebpf/user/profile/profile_common.c new file mode 100644 index 00000000000..08f502d43e4 --- /dev/null +++ b/agent/src/ebpf/user/profile/profile_common.c @@ -0,0 +1,1067 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * View kernel addresses exposed via /proc and other interfaces + * when /proc/sys/kernel/kptr_restrict has the value 1, it is + * necessary to set the CAP_SYSLOG capability, otherwise all k- + * ernel addresses are set to 0. + * + * This function is used to check if the kernel address is 0. 
+ */
+
+#include <sys/stat.h>
+#include <bcc/perf_reader.h>
+#include "../config.h"
+#include "../utils.h"
+#include "../common.h"
+#include "../mem.h"
+#include "../log.h"
+#include "../types.h"
+#include "../vec.h"
+#include "../tracer.h"
+#include "../socket.h"
+#include "java/gen_syms_file.h"
+#include "perf_profiler.h"
+#include "../elf.h"
+#include "../load.h"
+#include "../../kernel/include/perf_profiler.h"
+#include "../perf_reader.h"
+#include "../bihash_8_8.h"
+#include "stringifier.h"
+#include "../table.h"
+#include <regex.h>
+#include "java/config.h"
+#include "java/df_jattach.h"
+#include "profile_common.h"
+
+/*
+ * This section is for symbolization of Java addresses, and we need
+ * to prepare two .so libraries, one for GNU and the other for MUSL:
+ *
+ * df_java_agent.so
+ * df_java_agent_musl.so
+ *
+ * These two files need to be saved in the '/tmp' directory, and the
+ * agent so library will be injected into the JVM to generate
+ * 'perf-.map'.
+ */
+#include "java_agent_so_gnu.c"
+#include "java_agent_so_musl.c"
+/* used for Java symbol generation */
+#include "deepflow_jattach_bin.c"
+
+extern struct bpf_tracer *profiler_tracer;
+extern char *flame_graph_start_time;
+extern int major, minor;
+
+static bool java_installed;
+
+int profiler_context_init(struct profiler_context *ctx,
+ const char *tag, u8 type,
+ bool enable_profiler,
+ const char *state_map_name,
+ const char *stack_map_name_a,
+ const char *stack_map_name_b,
+ bool only_matched, bool use_delta_time)
+{
+ memset(ctx, 0, sizeof(struct profiler_context));
+ ctx->tag = tag;
+ atomic64_init(&ctx->process_lost_count);
+ if (!enable_profiler)
+ ctx->profiler_stop = 1;
+
+ snprintf(ctx->state_map_name, sizeof(ctx->state_map_name), "%s",
+ state_map_name);
+ snprintf(ctx->stack_map_name_a, sizeof(ctx->stack_map_name_a), "%s",
+ stack_map_name_a);
+ snprintf(ctx->stack_map_name_b, sizeof(ctx->stack_map_name_b), "%s",
+ stack_map_name_b);
+ ctx->regex_existed = false;
+ ctx->only_matched_data = only_matched;
+ ctx->use_delta_time = use_delta_time;
+ ctx->type = type;
+
+ return 0;
+}
+
+void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx,
+ u64 enable_flag)
+{
+ if (ctx->profiler_stop == 1)
+ return;
+
+ if (bpf_table_set_value(t, ctx->state_map_name,
+ ENABLE_IDX, &enable_flag) == false) {
+ ebpf_warning("%sprofiler state map update error."
+ "(%s enable_flag %lu) - %s\n",
+ ctx->tag, ctx->state_map_name, enable_flag,
+ strerror(errno));
+ return;
+ }
+
+ ctx->enable_bpf_profile = enable_flag;
+
+ ebpf_info("%s%s() success, enable_flag:%lu\n", ctx->tag, __func__,
+ enable_flag);
+}
+
+int do_profiler_regex_config(const char *pattern, struct profiler_context *ctx)
+{
+ if (*pattern == '\0') {
+ ctx->regex_existed = false;
+ ebpf_warning("%sSet 'profiler_regex' pattern : '', an empty"
+ " regular expression will not generate any stack data."
+ "Please configure the regular expression for profiler.\n", + ctx->tag); + return (0); + } + + if (ctx->regex_existed) { + regfree(&ctx->profiler_regex); + } + + int ret = regcomp(&ctx->profiler_regex, pattern, REG_EXTENDED); + if (ret != 0) { + char error_buffer[100]; + regerror(ret, &ctx->profiler_regex, error_buffer, + sizeof(error_buffer)); + ebpf_warning("%sPattern %s failed to compile the regular " + "expression: %s\n", ctx->tag, pattern, + error_buffer); + ctx->regex_existed = false; + return (-1); + } + + ctx->regex_existed = true; + return 0; +} + +static bool check_kallsyms_addr_is_zero(void) +{ + const int check_num = 100; + const int max_line_len = 256; + const char *check_str = "0000000000000000"; + + FILE *file = fopen("/proc/kallsyms", "r"); + if (file == NULL) { + ebpf_warning("Error opening /proc/kallsyms"); + return false; + } + + char line[max_line_len]; + int count = 0; + + while (fgets(line, sizeof(line), file) != NULL && count < check_num) { + char address[17]; // 16 characters + null terminator + sscanf(line, "%16s", address); + + if (strcmp(address, check_str) == 0) { + count++; + } + } + + fclose(file); + + return (count == check_num); +} + +bool run_conditions_check(void) +{ + // REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support). + if (check_kernel_version(4, 9) != 0) { + ebpf_warning + ("Currnet linux %d.%d, not support, require Linux 4.9+\n", + major, minor); + + return false; + } + + if (check_kallsyms_addr_is_zero()) { + ebpf_warning + ("All kernel addresses in /proc/kallsyms are 0, Please" + " follow the steps below to resolve:\n" + "1 Make sure the content of the '/proc/sys/kernel/kpt" + "r_restrict' file is not 2, if it is 2 please set it " + "to 1.\n2 Add 'CAP_SYSLOG' permission to the containe" + "r.\n3 Restart the pod."); + return false; + } + + return true; +} + +int java_libs_and_tools_install(void) +{ + if (java_installed) + return (0); + + // Java agent so library generation. 
+ if (access(AGENT_LIB_SRC_PATH, F_OK) == 0) { + if (unlink(AGENT_LIB_SRC_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + AGENT_LIB_SRC_PATH); + return (-1); + } + } + + if (access(AGENT_MUSL_LIB_SRC_PATH, F_OK) == 0) { + if (unlink(AGENT_MUSL_LIB_SRC_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + AGENT_MUSL_LIB_SRC_PATH); + return (-1); + } + } + + if (gen_file_from_mem((const char *)java_agent_so_gnu, + sizeof(java_agent_so_gnu), + (const char *)AGENT_LIB_SRC_PATH)) { + ebpf_warning("Java agent so library(%s) generate failed.\n", + AGENT_LIB_SRC_PATH); + return (-1); + } + + if (gen_file_from_mem((const char *)java_agent_so_musl, + sizeof(java_agent_so_musl), + (const char *)AGENT_MUSL_LIB_SRC_PATH)) { + ebpf_warning("Java agent so library(%s) generate failed.\n", + AGENT_MUSL_LIB_SRC_PATH); + return (-1); + } + + /* For java attach tool */ + if (access(JAVA_ATTACH_TOOL_PATH, F_OK) == 0) { + if (unlink(JAVA_ATTACH_TOOL_PATH) != 0) { + ebpf_warning("rm file %s failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + } + + if (gen_file_from_mem((const char *)deepflow_jattach_bin, + sizeof(deepflow_jattach_bin), + (const char *)JAVA_ATTACH_TOOL_PATH)) { + ebpf_warning("Java attach tool (%s) generate failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + + if (chmod(JAVA_ATTACH_TOOL_PATH, 0755) < 0) { + ebpf_warning("file '%s' chmod failed.\n", + JAVA_ATTACH_TOOL_PATH); + return (-1); + } + + java_installed = true; + + return (0); +} + +static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, + const char *stack_map_name) +{ + struct ebpf_map *map = + ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); + if (map == NULL) { + ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, + stack_map_name); + return 0; + } + int map_fd = map->fd; + + u32 key = 0, next_key; + u32 reclaim_count = 0; + u32 find_count = 0; + struct list_head clear_elem_head; + init_list_head(&clear_elem_head); + + while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { + find_count++; + insert_list(&next_key, sizeof(next_key), &clear_elem_head); + key = next_key; + } + + reclaim_count = __reclaim_map(map_fd, &clear_elem_head); + + ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", + __func__, stack_map_name, find_count, reclaim_count); + + return reclaim_count; +} + +static void cleanup_stackmap(struct profiler_context *ctx, struct bpf_tracer *t, + const char *stack_map_name, bool is_a) +{ + struct stack_ids_bitmap *ids; + int *clear_stack_ids; + u64 *perf_buf_lost_p = NULL; + + if (is_a) { + ids = &ctx->stack_ids_a; + clear_stack_ids = ctx->clear_stack_ids_a; + perf_buf_lost_p = &ctx->perf_buf_lost_a_count; + } else { + ids = &ctx->stack_ids_b; + clear_stack_ids = ctx->clear_stack_ids_b; + perf_buf_lost_p = &ctx->perf_buf_lost_b_count; + } + + if (ids->count != vec_len(clear_stack_ids)) { + ebpf_warning + ("%sstack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", + ctx->tag, ids->count, vec_len(clear_stack_ids)); + } + + /* + * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, + * which may lead to out-of-order behavior in a multi-core environment. + * We have employed a threshold to delay the cleanup of the stack map, reducing the + * occurrence of premature clearing of stack entries caused by the disorder in stack + * data. + * + * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in + * 'agent/src/ebpf/user/config.h'. 
+ */
+ if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) {
+ int *sid;
+ vec_foreach(sid, clear_stack_ids) {
+ int id = *sid;
+ if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) {
+ /*
+ * It may be due to the disorder in the perf buffer transmission,
+ * leading to the repetitive deletion of the same stack ID.
+ */
+ ctx->stackmap_clear_failed_count++;
+ }
+
+ clear_bitmap(ids->bitmap, id);
+ }
+
+ if (is_a)
+ vec_free(ctx->clear_stack_ids_a);
+ else
+ vec_free(ctx->clear_stack_ids_b);
+
+ ids->count = 0;
+
+ /*
+ * If data loss occurs due to the user-space receiver program
+ * being too busy and not promptly fetching data from the perf
+ * buffer, it is necessary to clean the stack map once to prevent
+ * excessive remnants of stack data from affecting the acquisition
+ * of new stack data (i.e., eBPF using the bpf_get_stackid()
+ * interface will return -EEXIST).
+ */
+ if (*perf_buf_lost_p > 0) {
+ delete_all_stackmap_elems(t, stack_map_name);
+ *perf_buf_lost_p = 0;
+ }
+ }
+}
+
+static void print_profiler_status(struct profiler_context *ctx,
+ struct bpf_tracer *t, u64 iter_count)
+{
+ u64 alloc_b, free_b;
+ get_mem_stat(&alloc_b, &free_b);
+ ebpf_debug("\n\n----------------------------\n%srecv event:\t%lu\n"
+ "kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n"
+ "stack_trace_err:\t%lu\n"
+ "stackmap_clear_failed_count\t%lu\n"
+ "transfer_count:\t%lu iter_count:\t%lu\nall"
+ "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n"
+ "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n",
+ ctx->tag, atomic64_read(&t->recv), atomic64_read(&t->lost),
+ ctx->perf_buf_lost_a_count, ctx->perf_buf_lost_b_count,
+ ctx->stack_trace_err, ctx->stackmap_clear_failed_count,
+ ctx->transfer_count, iter_count,
+ alloc_b, free_b, alloc_b - free_b,
+ ctx->stack_str_hash.hit_hash_count,
+ ctx->msg_hash.hit_hash_count);
+}
+
+static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *arg)
+{
+ struct profiler_context *ctx = arg;
+ stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv;
+ if (msg_kv->msg_ptr != 0) {
+ stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr;
+
+ /* continuous profiler debug */
+ cpdbg_process(msg);
+
+ tracer_callback_t fun = profiler_tracer->process_fn;
+ /*
+ * Execute callback function to hand over the data to the
+ * higher level for processing. The higher level will se-
+ * nd the data to the server for storage as required.
+ */
+ if (likely(ctx->profiler_stop == 0))
+ fun(msg);
+
+ clib_mem_free((void *)msg);
+ msg_kv->msg_ptr = 0;
+ }
+
+ int ret = VEC_OK;
+ vec_add1(ctx->trace_msg_kvps, *kv, ret);
+ if (ret != VEC_OK) {
+ ebpf_warning("vec add failed\n");
+ ctx->msg_clear_hash = true;
+ }
+
+ return BIHASH_WALK_CONTINUE;
+}
+
+/*
+ * Push the data and release the resources.
+ * @is_force: Whether to force the push and release.
+ */
+void push_and_release_stack_trace_msg(struct profiler_context *ctx,
+ stack_trace_msg_hash_t * h, bool is_force)
+{
+ ASSERT(profiler_tracer != NULL);
+
+ u64 curr_time, elapsed;
+ curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN);
+ elapsed = curr_time - ctx->last_push_time;
+
+ /*
+ * If the aggregated stack trace data obtained by the profiler
+ * satisfies one of the following conditions, it should be pushed
+ * to the upper-level processing:
+ *
+ * If the time interval since the last push exceeds or equals
+ * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL).
+ *
+ * Otherwise, it should return directly.
+ */ + if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) + return; + + /* update last push time. */ + ctx->last_push_time = curr_time; + ctx->push_count++; + + stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, + (void *)ctx); + /* + * In this iteration, all elements will be cleared, and in the + * next iteration, this hash will be reused. + */ + stack_trace_msg_hash_kv *v; + vec_foreach(v, ctx->trace_msg_kvps) { + if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { + ebpf_warning + ("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + ctx->msg_clear_hash = true; + } + } + + vec_free(ctx->trace_msg_kvps); + + h->hit_hash_count = 0; + h->hash_elems_count = 0; + + if (ctx->msg_clear_hash) { + ctx->msg_clear_hash = false; + stack_trace_msg_hash_free(h); + } +} + +static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, + const char *name) +{ + memset(h, 0, sizeof(*h)); + u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; + u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; + return stack_trace_msg_hash_init(h, (char *)name, + nbuckets, hash_memory_size); +} + +static void add_stack_id_to_bitmap(struct profiler_context *ctx, + int stack_id, bool is_a) +{ + if (stack_id < 0) + return; + + struct stack_ids_bitmap *ids; + if (is_a) + ids = &ctx->stack_ids_a; + else + ids = &ctx->stack_ids_b; + + if (!is_set_bitmap(ids->bitmap, stack_id)) { + set_bitmap(ids->bitmap, stack_id); + int ret = VEC_OK; + + if (is_a) + vec_add1(ctx->clear_stack_ids_a, stack_id, ret); + else + vec_add1(ctx->clear_stack_ids_b, stack_id, ret); + + if (ret != VEC_OK) { + ebpf_warning("%svec add failed\n", ctx->tag); + } + + ids->count++; + } +} + +/* + * The invocation of this interface is always when the process name does + * not match. + */ +static void set_msg_kvp_by_comm(stack_trace_msg_kv_t * kvp, + struct stack_trace_key_t *v, void *msg_value) +{ + strcpy_s_inline(kvp->c_k.comm, sizeof(kvp->c_k.comm), + v->comm, strlen(v->comm)); + kvp->c_k.cpu = v->cpu; + kvp->c_k.pid = v->tgid; + kvp->c_k.reserved = 0; + kvp->msg_ptr = pointer_to_uword(msg_value); +} + +static void set_msg_kvp(stack_trace_msg_kv_t * kvp, + struct stack_trace_key_t *v, u64 stime, void *msg_value) +{ + kvp->k.tgid = v->tgid; + kvp->k.pid = v->pid; + kvp->k.stime = stime; + kvp->k.cpu = v->cpu; + kvp->k.u_stack_id = (u32) v->userstack; + kvp->k.k_stack_id = (u32) v->kernstack; + kvp->msg_ptr = pointer_to_uword(msg_value); +} + +static void set_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_t * msg, + struct stack_trace_key_t *v, + bool matched, + u64 stime, + u64 ns_id, + const char *process_name, + const char *container_id) +{ + msg->pid = v->tgid; + msg->tid = v->pid; + msg->cpu = v->cpu; + msg->u_stack_id = (u32) v->userstack; + msg->k_stack_id = (u32) v->kernstack; + strcpy_s_inline(msg->comm, sizeof(msg->comm), v->comm, strlen(v->comm)); + msg->stime = stime; + msg->netns_id = ns_id; + msg->profiler_type = ctx->type; + if (container_id != NULL) { + strcpy_s_inline(msg->container_id, sizeof(msg->container_id), + container_id, strlen(container_id)); + } + + if (stime > 0) { + /* + * Note: There is no process with PID 0 in procfs. + * If the PID is 0, it will return the kernel's + * startup time, and the process name will be + * obtained from data retrieved through eBPF. 
+ */ + if (msg->pid == 0) { + memcpy(msg->process_name, v->comm, sizeof(msg->comm)); + } else { + if (process_name != NULL) { + strcpy_s_inline(msg->process_name, + sizeof(msg->process_name), + process_name, + strlen(process_name)); + } + } + + } else { + + /* + * If the process has already exited, then execution reaches + * this point, which means aggregating data based on the + * process name. + */ + strcpy_s_inline(msg->process_name, sizeof(msg->process_name), + v->comm, strlen(v->comm)); + atomic64_inc(&ctx->process_lost_count); + } + + if (!matched || stime <= 0) { + /* The aggregation method is identified as + * { process name + [u,k]stack_trace_id + cpu} */ + msg->stime = 0; + if (!matched) { + msg->pid = msg->tid = 0; + snprintf((char *)msg->process_name, + sizeof(msg->process_name), "%s", "Total"); + } + } + + msg->time_stamp = gettime(CLOCK_REALTIME, TIME_TYPE_NAN); + if (ctx->use_delta_time) + // Using microseconds for storage. + msg->count = v->duration_ns / 1000; + else + msg->count = 1; + msg->data_ptr = pointer_to_uword(&msg->data[0]); + + /* Only use for test flame graph. */ + if (flame_graph_start_time == NULL) { + flame_graph_start_time = gen_file_name_by_datetime(); + } +} + +static inline stack_trace_msg_t *alloc_stack_trace_msg(int len) +{ + void *trace_msg; + trace_msg = clib_mem_alloc_aligned("stack_msg", len, 0, NULL); + if (trace_msg == NULL) { + ebpf_warning("stack trace msg alloc memory failed.\n"); + } else { + stack_trace_msg_t *msg = trace_msg; + return msg; + } + + return NULL; +} + +static inline void update_matched_process_in_total(struct profiler_context *ctx, + stack_trace_msg_hash_t * + msg_hash, + char *process_name, + struct stack_trace_key_t *v) +{ + stack_trace_msg_kv_t kv; + set_msg_kvp_by_comm(&kv, v, (void *)0); + + if (stack_trace_msg_hash_search + (msg_hash, (stack_trace_msg_hash_kv *) & kv, + (stack_trace_msg_hash_kv *) & kv) == 0) { + __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); + ((stack_trace_msg_t *) kv.msg_ptr)->count++; + return; + } + + /* append ';' '\0' and '[p/t]' */ + char trace_str[(TASK_COMM_LEN * 2) + 10]; + bool is_thread = (v->pid != v->tgid); + if (is_thread) + snprintf(trace_str, sizeof(trace_str), "[p] %s;[t] %s", + process_name, v->comm); + else + snprintf(trace_str, sizeof(trace_str), "[p] %s", process_name); + + /* append 2 byte for ';''\0' */ + int len = sizeof(stack_trace_msg_t) + strlen(trace_str) + 2; + stack_trace_msg_t *msg = alloc_stack_trace_msg(len); + if (msg == NULL) { + clib_mem_free(trace_str); + return; + } + + set_stack_trace_msg(ctx, msg, v, false, 0, 0, process_name, NULL); + snprintf((char *)&msg->data[0], strlen(trace_str) + 2, "%s", trace_str); + msg->data_len = strlen((char *)msg->data); + kv.msg_ptr = pointer_to_uword(msg); + + if (stack_trace_msg_hash_add_del(msg_hash, + (stack_trace_msg_hash_kv + *) & kv, 1 /* is_add */ )) { + ebpf_warning("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + clib_mem_free(msg); + } else { + __sync_fetch_and_add(&msg_hash->hash_elems_count, 1); + } +} + +static void aggregate_stack_traces(struct profiler_context *ctx, + struct bpf_tracer *t, + const char *stack_map_name, + stack_str_hash_t * stack_str_hash, + stack_trace_msg_hash_t * msg_hash, + u32 * count, bool use_a_map) +{ + struct stack_trace_key_t *v; + vec_foreach(v, ctx->raw_stack_data) { + if (v == NULL) + break; + + if (unlikely(ctx->profiler_stop == 1)) + break; + + /* + * If cpu_aggregation_flag=0, the CPU value for stack trace data + * reporting is a special value (CPU_INVALID:0xfff) used 
to indicate + * that it is an invalid value, the CPUID will not be included in + * the aggregation. + */ + if (ctx->cpu_aggregation_flag == 0) + v->cpu = CPU_INVALID; + + /* + * Uniform idle process names to reduce the aggregated count of stack + * trace data (when we aggregate using process names as part of the key). + * "swapper/0", "swapper/1", "swapper/2" ... > "swapper" + */ + if (v->pid == v->tgid && v->pid == 0) { + const char *idle_name = "swapper"; + strcpy_s_inline(v->comm, sizeof(v->comm), + idle_name, strlen(idle_name)); + } + + /* -EEXIST: Hash bucket collision in the stack trace table */ + if (v->kernstack == -EEXIST) + ctx->stack_trace_err++; + + if (v->userstack == -EEXIST) + ctx->stack_trace_err++; + + add_stack_id_to_bitmap(ctx, v->kernstack, use_a_map); + add_stack_id_to_bitmap(ctx, v->userstack, use_a_map); + + /* Total iteration count for this iteration. */ + (*count)++; + + /* Total iteration count for all iterations. */ + ctx->process_count++; + + /* + * Firstly, search the stack-trace-msg hash to see if the + * stack trace messages has already been stored. + */ + stack_trace_msg_kv_t kv; + char name[TASK_COMM_LEN]; + memset(name, 0, sizeof(name)); + u64 stime, netns_id; + stime = netns_id = 0; + void *info_p = NULL; + struct symbolizer_proc_info *__info_p = NULL; + char *process_name = NULL; + bool matched, is_match_finish; + matched = is_match_finish = false; + + /* If it is a process, match operation will be performed immediately. */ + if (v->pid == v->tgid) { + is_match_finish = true; + profile_regex_lock(ctx); + matched = + (regexec(&ctx->profiler_regex, v->comm, 0, NULL, 0) + == 0); + profile_regex_unlock(ctx); + if (!matched) { + if (ctx->only_matched_data) { + continue; + } + + set_msg_kvp_by_comm(&kv, v, (void *)0); + goto skip_proc_find; + } + } + + get_process_info_by_pid(v->tgid, &stime, &netns_id, + (char *)name, &info_p); + __info_p = info_p; + + /* + * If the data collected is from a running process, and the process + * name and the command name of the task (captured by eBPF) are not + * consistent, it indicates that the cached process information is + * no longer valid. + */ + if (stime > 0 && v->pid == v->tgid && strcmp(name, v->comm)) { + stime = netns_id = 0; + name[0] = '\0'; + process_name = NULL; + info_p = NULL; + } + + if (stime > 0) { + if (v->tgid == 0) + process_name = v->comm; + else + process_name = name; + + if (!is_match_finish) { + profile_regex_lock(ctx); + matched = + (regexec + (&ctx->profiler_regex, process_name, 0, + NULL, 0) + == 0); + profile_regex_unlock(ctx); + } + + if (matched) + set_msg_kvp(&kv, v, stime, (void *)0); + else { + if (ctx->only_matched_data) { + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + continue; + } + + set_msg_kvp_by_comm(&kv, v, (void *)0); + } + } else { + if (ctx->only_matched_data) { + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + continue; + } + + /* Not find process in procfs. */ + set_msg_kvp_by_comm(&kv, v, (void *)0); + } + + /* + * Here, we duplicate the matched process data and place it into + * the Total process, with the aim of showcasing the proportion + * of each process in the overall sampling. 
+ */ + if (matched && !ctx->only_matched_data) + update_matched_process_in_total(ctx, msg_hash, + process_name, v); + + skip_proc_find: + if (stack_trace_msg_hash_search + (msg_hash, (stack_trace_msg_hash_kv *) & kv, + (stack_trace_msg_hash_kv *) & kv) == 0) { + __sync_fetch_and_add(&msg_hash->hit_hash_count, 1); + ((stack_trace_msg_t *) kv.msg_ptr)->count += + v->duration_ns; + if (__info_p && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + continue; + } + + /* + * Folded stack trace string and generate stack trace messages. + * + * Folded stack trace string (taken from a performance profiler test): + * main;xxx();yyy() + * It is a list of symbols corresponding to addresses in the underlying + * stack trace, separated by ';'. + */ + + char *trace_str = + resolve_and_gen_stack_trace_str(t, v, stack_map_name, + stack_str_hash, matched, + process_name, info_p); + + if (trace_str) { + /* + * append process/thread name to stack string + * append 2 byte for ';''\0' + * append pre_tag '[p/t]' + */ + char pre_tag[5]; + int str_len = strlen(trace_str) + 2; + if (matched) + str_len += strlen(v->comm) + sizeof(pre_tag); + + int len = sizeof(stack_trace_msg_t) + str_len; + stack_trace_msg_t *msg = alloc_stack_trace_msg(len); + if (msg == NULL) { + clib_mem_free(trace_str); + if (__info_p + && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + continue; + } + + memset(msg, 0, len); + struct symbolizer_proc_info *__p = info_p; + set_stack_trace_msg(ctx, msg, v, matched, stime, + netns_id, process_name, + __p ? __p->container_id : NULL); + + snprintf(pre_tag, sizeof(pre_tag), "%s ", + v->pid == v->tgid ? "[p]" : "[t]"); + if (matched) + snprintf((char *)&msg->data[0], str_len, + "%s%s;%s", pre_tag, v->comm, + trace_str); + else + snprintf((char *)&msg->data[0], str_len, "%s", + trace_str); + + msg->data_len = strlen((char *)msg->data); + clib_mem_free(trace_str); + kv.msg_ptr = pointer_to_uword(msg); + + if (stack_trace_msg_hash_add_del(msg_hash, + (stack_trace_msg_hash_kv + *) & kv, + 1 /* is_add */ )) { + ebpf_warning + ("%sstack_trace_msg_hash_add_del() failed.\n", + ctx->tag); + clib_mem_free(msg); + } else { + __sync_fetch_and_add + (&msg_hash->hash_elems_count, 1); + } + } + + if (__info_p && AO_SUB_F(&__info_p->use, 1) == 0) + free_proc_cache(__info_p); + + /* check and clean symbol cache */ + exec_proc_info_cache_update(); + } + + vec_free(ctx->raw_stack_data); +} + +void process_bpf_stacktraces(struct profiler_context *ctx, struct bpf_tracer *t) +{ + struct bpf_perf_reader *r; + const char *stack_map_name; + bool using_map_set_a = (ctx->transfer_count % 2 == 0); + r = using_map_set_a ? ctx->r_a : ctx->r_b; + stack_map_name = + using_map_set_a ? ctx->stack_map_name_a : ctx->stack_map_name_b; + const u64 sample_count_idx = + using_map_set_a ? SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; + + struct epoll_event events[r->readers_count]; + int nfds = reader_epoll_wait(r, events, 0); + + ctx->transfer_count++; + if (bpf_table_set_value(t, ctx->state_map_name, + TRANSFER_CNT_IDX, + &ctx->transfer_count) == false) { + ebpf_warning("%sprofiler state map update error." + "(%s transfer_count %lu) - %s\n", + ctx->tag, ctx->state_map_name, ctx->transfer_count, + strerror(errno)); + ctx->transfer_count--; + } + + /* Total iteration count for this iteration. */ + u32 count = 0; + + /* eBPF map record count for this iteration. */ + u64 sample_cnt_val = 0; + + /* + * Why use g_stack_str_hash? 
+ * + * When the stringizer encounters a stack-ID for the first time in + * the stack trace table, it clears it. If a stack-ID is reused by + * different stack trace keys, the stringizer returns its memoized + * stack trace string. Since stack IDs are unstable between profile + * iterations, we create and destroy the stringizer in each profile + * iteration. + */ + if (unlikely(ctx->stack_str_hash.buckets == NULL)) { + if (init_stack_str_hash + (&ctx->stack_str_hash, "profile_stack_str")) { + ebpf_warning("%sinit_stack_str_hash() failed.\n", + ctx->tag); + return; + } + } + + /* + * During each transmission iteration, we have a hashmap structure in + * place for the following purposes: + * + * 1 Pushing the data of this iteration to the higher-level processing. + * 2 Performing data statistics based on the stack trace data, using the + * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + + * u_stack_id + " as the key. + * + * Here is the key-value pair structure of the hashmap: + * see perf_profiler.h (stack_trace_msg_kv_t) + * This is the final form of the data. If the current stack trace message + * is a match, we only need to increment the count field in the correspon- + * ding value, thus avoiding duplicate parsing. + */ + if (unlikely(ctx->msg_hash.buckets == NULL)) { + if (init_stack_trace_msg_hash + (&ctx->msg_hash, "stack_trace_msg")) { + ebpf_warning("%sinit_stack_trace_msg_hash() failed.\n", + ctx->tag); + return; + } + } + + if (nfds > 0) { + + check_again: + if (unlikely(ctx->profiler_stop == 1)) + goto release_iter; + + /* + * If there is data, the reader's callback + * function will be called. + */ + reader_event_read(events, nfds); + + /* + * After the reader completes data reading, the work of + * data aggregation will be blocked if there is no data. + */ + aggregate_stack_traces(ctx, t, stack_map_name, + &ctx->stack_str_hash, &ctx->msg_hash, + &count, using_map_set_a); + + /* + * To ensure that all data in the perf ring-buffer is procenssed + * in this iteration, as this iteration will clean up all the + * data recorded in the stackmap, any residual data in the perf + * ring-buffer will be carried over to the next iteration for + * processing. This poses a risk of not being able to find the + * corresponding stackmap records in the next iteration, leading + * to incomplete processing. + */ + if (bpf_table_get_value(t, ctx->state_map_name, + sample_count_idx, + (void *)&sample_cnt_val)) { + if (sample_cnt_val > count) { + nfds = reader_epoll_short_wait(r, events, 0); + if (nfds > 0) + goto check_again; + } + } + } + +release_iter: + + cleanup_stackmap(ctx, t, stack_map_name, using_map_set_a); + + /* Now that we've consumed the data, reset the sample count in BPF. */ + sample_cnt_val = 0; + bpf_table_set_value(t, ctx->state_map_name, + sample_count_idx, &sample_cnt_val); + + print_profiler_status(ctx, t, count); + + /* free all elems */ + clean_stack_strs(&ctx->stack_str_hash); + + /* Push messages and free stack_trace_msg_hash */ + push_and_release_stack_trace_msg(ctx, &ctx->msg_hash, false); +} diff --git a/agent/src/ebpf/user/profile/profile_common.h b/agent/src/ebpf/user/profile/profile_common.h new file mode 100644 index 00000000000..96586dc393d --- /dev/null +++ b/agent/src/ebpf/user/profile/profile_common.h @@ -0,0 +1,162 @@ +/* + * Copyright (c) 2024 Yunshan Networks + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef DF_USER_PROFILE_COMMON_H +#define DF_USER_PROFILE_COMMON_H + +struct profiler_context { + // log output flag string + const char *tag; + // Profiler type + u8 type; + // The name of the status map + char state_map_name[MAP_NAME_SZ]; + // The dual-buffered reader is used to read data from the perf buffer. + struct bpf_perf_reader *r_a; + struct bpf_perf_reader *r_b; + // stack map name + char stack_map_name_a[MAP_NAME_SZ]; + char stack_map_name_b[MAP_NAME_SZ]; + + // Read raw data from the eBPF perfbuf and temporarily store it. + struct stack_trace_key_t *raw_stack_data; + + // Cache hash: obtain folded stack trace string from stack ID. + stack_str_hash_t stack_str_hash; + // Used for tracking data statistics and pushing. + stack_trace_msg_hash_t msg_hash; + + /* + * 'cpu_aggregation_flag' is used to set whether to retrieve CPUID + * and include it in the aggregation of stack trace data. + * + * If valude is set to 1, CPUID will be retrieved and included in + * the aggregation of stack trace data. If value is set to 0, + * CPUID will not be retrieved and will not be included in the + * aggregation. Any other value is considered invalid. + */ + volatile u64 cpu_aggregation_flag; + + /* + * To perform regular expression matching on process names, + * the 'profiler_regex' is set using the 'set_profiler_regex()' + * interface. Processes that successfully match the regular + * expression are aggregated using the key: + * `{pid + stime + u_stack_id + k_stack_id + tid + cpu}`. + * + * For processes that do not match, they are aggregated using the + * key: + * ``. + */ + regex_t profiler_regex; + bool regex_existed; + volatile u32 regex_lock; + + /* + * The profiler stop flag, with 1 indicating stop and + * 0 indicating running status. + */ + volatile u64 profiler_stop; + + /* + * This flag is used to enable the eBPF program to start working. + * with 1 indicating enable and 0 indicating disable. + */ + volatile u64 enable_bpf_profile; + + /* + * The identifier to only retrieve matched data. This flag + * setting will exclude the total process. + */ + bool only_matched_data; + + /* + * In the sampling scenario, the number of samples is used; in the + * non-sampling scenario, real-time intervals (in nanoseconds) are + * used. This setting determines whether to use time intervals, + * with a default value of false. + */ + bool use_delta_time; + + // Record all stack IDs in each iteration for quick retrieval. + struct stack_ids_bitmap stack_ids_a; + struct stack_ids_bitmap stack_ids_b; + // This vector table is used to remove a stack from the stack map. + int *clear_stack_ids_a; + int *clear_stack_ids_b; + + // for stack_trace_msg_hash relese + stack_trace_msg_hash_kv *trace_msg_kvps; + bool msg_clear_hash; + + /* profiler statistics */ + + // Switching between dual buffers. + u64 transfer_count; + // Total iteration count for all iterations. + u64 process_count; + u64 stackmap_clear_failed_count; + // perf buffer queue loss statistics. 
+ u64 perf_buf_lost_a_count; + u64 perf_buf_lost_b_count; + /* + * During the parsing process, it is possible for processes in procfs + * to be missing (processes that start and exit quickly). This variable + * is used to count the number of lost processes during the parsing process. + */ + atomic64_t process_lost_count; + // Stack error quantity statistics obtained by eBPF. + u64 stack_trace_err; + // Quantity statistics of data pushed. + u64 push_count; + + /* + * Record the time of the last data push + * (in seconds since system startup) + */ + u64 last_push_time; +}; + +static inline void profile_regex_lock(struct profiler_context *c) +{ + while (__atomic_test_and_set(&c->regex_lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +static inline void profile_regex_unlock(struct profiler_context *c) +{ + __atomic_clear(&c->regex_lock, __ATOMIC_RELEASE); +} + +void process_bpf_stacktraces(struct profiler_context *ctx, + struct bpf_tracer *t); +int do_profiler_regex_config(const char *pattern, struct profiler_context *ctx); +void set_bpf_run_enabled(struct bpf_tracer *t, struct profiler_context *ctx, + u64 enable_flag); +int profiler_context_init(struct profiler_context *ctx, + const char *tag, + u8 type, + bool enable_profiler, + const char *state_map_name, + const char *stack_map_name_a, + const char *stack_map_name_b, + bool only_matched, bool use_delta_time); +bool run_conditions_check(void); +int java_libs_and_tools_install(void); +void push_and_release_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_hash_t * h, + bool is_force); +#endif /*DF_USER_PROFILE_COMMON_H */ diff --git a/agent/src/ebpf/user/profile/stringifier.c b/agent/src/ebpf/user/profile/stringifier.c index 2ec2cffde33..3e1532e403b 100644 --- a/agent/src/ebpf/user/profile/stringifier.c +++ b/agent/src/ebpf/user/profile/stringifier.c @@ -78,7 +78,7 @@ static const char *u_sym_prefix = ""; * in the loss of stack data. This situation is rare and difficult * to occur. 
*/ -static u64 stack_table_data_miss; +static __thread u64 stack_table_data_miss; u64 get_stack_table_data_miss_count(void) { @@ -159,14 +159,29 @@ static inline char *create_symbol_str(int len, char *src, const char *tag) } static inline int symcache_resolve(pid_t pid, void *resolver, u64 address, - struct bcc_symbol *sym) + struct bcc_symbol *sym, void *info_p) { ASSERT(pid >= 0); - if (pid == 0) - return bcc_symcache_resolve_no_demangle(resolver, address, sym); - else - return bcc_symcache_resolve(resolver, address, sym); + int ret = -1; + if (pid == 0) { + symbolizer_kernel_lock(); + ret = bcc_symcache_resolve_no_demangle(resolver, address, sym); + symbolizer_kernel_unlock(); + } else { + struct symbolizer_proc_info *p = info_p; + if (p) { + symbolizer_proc_lock(p); + if ((u64) resolver != (u64) p->syms_cache) { + symbolizer_proc_unlock(p); + return (-1); + } + ret = bcc_symcache_resolve(resolver, address, sym); + symbolizer_proc_unlock(p); + } + } + + return ret; } static char *symbol_name_fetch(pid_t pid, struct bcc_symbol *sym) @@ -205,11 +220,12 @@ static char *resolve_addr(struct bpf_tracer *t, pid_t pid, bool is_start_idx, char *ptr = NULL; char format_str[32]; struct bcc_symbol sym; + memset(&sym, 0, sizeof(sym)); void *resolver = get_symbol_cache(pid, is_create); if (resolver == NULL) goto resolver_err; - int ret = symcache_resolve(pid, resolver, address, &sym); + int ret = symcache_resolve(pid, resolver, address, &sym, info_p); if (ret == 0) { ptr = symbol_name_fetch(pid, &sym); if (ptr) { @@ -241,9 +257,11 @@ static char *resolve_addr(struct bpf_tracer *t, pid_t pid, bool is_start_idx, ptr = create_symbol_str(len, format_str, ""); if (info_p) { struct symbolizer_proc_info *p = info_p; + symbolizer_proc_lock(p); if (p->is_java && strstr(format_str, "perf-")) { p->unknown_syms_found = true; } + symbolizer_proc_unlock(p); } goto finish; } diff --git a/agent/src/ebpf/user/symbol.c b/agent/src/ebpf/user/symbol.c index 978ab28db9c..9f75e847d9d 100644 --- a/agent/src/ebpf/user/symbol.c +++ b/agent/src/ebpf/user/symbol.c @@ -489,36 +489,59 @@ uint64_t get_symbol_addr_from_binary(const char *bin, const char *symname) #ifndef AARCH64_MUSL static symbol_caches_hash_t syms_cache_hash; // for user process symbol caches static void *k_resolver; // for kernel symbol cache +static volatile u32 k_resolver_lock; static u64 sys_btime_msecs; // system boot time(milliseconds) +void symbolizer_kernel_lock(void) +{ + while (__atomic_test_and_set(&k_resolver_lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +void symbolizer_kernel_unlock(void) +{ + __atomic_clear(&k_resolver_lock, __ATOMIC_RELEASE); +} + static bool inline enable_proc_info_cache(void) { return (syms_cache_hash.buckets != NULL); } -static inline void free_symbolizer_cache_kvp(struct symbolizer_cache_kvp *kv) +void free_proc_cache(struct symbolizer_proc_info *p) { - if (kv->v.cache) { - bcc_free_symcache((void *)kv->v.cache, kv->k.pid); + symbolizer_proc_lock(p); + if (p->is_java) { + /* Delete target ns Java files */ + int pid = (int)p->pid; + if (pid > 0) { + clean_local_java_symbols_files(pid); + } + } + + if (p->syms_cache) { + bcc_free_symcache((void *)p->syms_cache, p->pid); free_symcache_count++; - kv->v.cache = 0; } + p->syms_cache = 0; + symbolizer_proc_unlock(p); + clib_mem_free((void *)p); +} +static void free_symbolizer_cache_kvp(struct symbolizer_cache_kvp *kv) +{ if (kv->v.proc_info_p) { struct symbolizer_proc_info *p; p = (struct symbolizer_proc_info *)kv->v.proc_info_p; - if (p->is_java) { - /* Delete target ns Java files 
*/ - int pid = (int)kv->k.pid; - if (pid > 0) { - clean_local_java_symbols_files(pid); - } - } - /* Ensure that all tasks are completed before releasing. */ if (AO_SUB_F(&p->use, 1) == 0) { - clib_mem_free((void *)p); + if (kv->v.cache != p->syms_cache) + ebpf_warning + ("kv->v.cache 0x%lx p->syms_cache 0x%lx\n", + kv->v.cache, p->syms_cache); + free_proc_cache(p); kv->v.proc_info_p = 0; + kv->v.cache = 0; } } } @@ -574,10 +597,14 @@ static inline struct symbolizer_proc_info *add_proc_info_to_cache(struct kv->v.proc_info_p = pointer_to_uword(p); kv->v.cache = 0; - if (symbol_caches_hash_add_del - (h, (symbol_caches_hash_kv *) kv, 1 /* is_add */ )) { + int ret; + if ((ret = symbol_caches_hash_add_del + (h, (symbol_caches_hash_kv *) kv, + 2 /* is_add = 2, Add but do not overwrite? */ )) != 0) { + // If it already exists, return -2. ebpf_warning - ("symbol_caches_hash_add_del() failed.(pid %d)\n", pid); + ("symbol_caches_hash_add_del() failed.(pid %d), return %d\n", + pid, ret); free_symbolizer_cache_kvp(kv); return NULL; } else @@ -589,7 +616,6 @@ static inline struct symbolizer_proc_info *add_proc_info_to_cache(struct static inline int del_proc_info_from_cache(struct symbolizer_cache_kvp *kv) { symbol_caches_hash_t *h = &syms_cache_hash; - free_symbolizer_cache_kvp(kv); if (symbol_caches_hash_add_del(h, (symbol_caches_hash_kv *) kv, 0 /* delete */ )) { ebpf_warning @@ -599,7 +625,7 @@ static inline int del_proc_info_from_cache(struct symbolizer_cache_kvp *kv) } else { __sync_fetch_and_add(&h->hash_elems_count, -1); } - + free_symbolizer_cache_kvp(kv); return 0; } @@ -770,6 +796,8 @@ static int config_symbolizer_proc_info(struct symbolizer_proc_info *p, int pid) p->new_java_syms_file = false; p->cache_need_update = false; p->gen_java_syms_file_err = false; + p->lock = 0; + p->syms_cache = 0; p->netns_id = get_netns_id_from_pid(pid); if (p->netns_id == 0) return ETR_INVAL; @@ -800,6 +828,18 @@ static int config_symbolizer_proc_info(struct symbolizer_proc_info *p, int pid) return ETR_OK; } +static inline void write_return_value(struct symbolizer_cache_kvp *kv, + struct symbolizer_proc_info *p, + u64 * stime, u64 * netns_id, char *name, + void **ptr) +{ + copy_process_name(kv, name); + *stime = cache_process_stime(kv); + *netns_id = cache_process_netns_id(kv); + *ptr = p; + AO_INC(&p->use); +} + void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, void **ptr) { @@ -824,13 +864,21 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, p = add_proc_info_to_cache(&kv); if (p == NULL) return; + symbolizer_proc_lock(p); + write_return_value(&kv, p, stime, netns_id, name, ptr); + symbolizer_proc_unlock(p); + } else { p = (struct symbolizer_proc_info *)kv.v.proc_info_p; + symbolizer_proc_lock(p); u64 curr_time = current_sys_time_secs(); if (!p->verified) { if (((curr_time - (p->stime / 1000ULL)) < PROC_INFO_VERIFY_TIME)) { - goto fetch_proc_info; + write_return_value(&kv, p, stime, netns_id, + name, ptr); + symbolizer_proc_unlock(p); + return; } /* @@ -840,21 +888,24 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, * for a period of time to avoid such situations. */ char comm[sizeof(p->comm)]; - u64 stime = (u64) + u64 proc_stime = (u64) get_process_starttime_and_comm(pid, comm, sizeof(comm)); - if (stime == 0) { + if (proc_stime == 0) { /* * Here, indicate that during the symbolization process, * the process has already terminated, but the process * information has not yet been cleared. 
In this case, we * continue to use the previously retained information. */ - goto fetch_proc_info; + write_return_value(&kv, p, stime, netns_id, + name, ptr); + symbolizer_proc_unlock(p); + return; } - p->stime = stime; + p->stime = proc_stime; memcpy(p->comm, comm, sizeof(p->comm)); p->comm[sizeof(p->comm) - 1] = '\0'; @@ -866,13 +917,9 @@ void get_process_info_by_pid(pid_t pid, u64 * stime, u64 * netns_id, char *name, p->verified = true; } + write_return_value(&kv, p, stime, netns_id, name, ptr); + symbolizer_proc_unlock(p); } - -fetch_proc_info: - copy_process_name(&kv, name); - *stime = cache_process_stime(&kv); - *netns_id = cache_process_netns_id(&kv); - *ptr = p; } static void *symbols_cache_update(symbol_caches_hash_t * h, @@ -911,6 +958,7 @@ static void *symbols_cache_update(symbol_caches_hash_t * h, p->new_java_syms_file = false; p->add_task_list = false; p->cache_need_update = false; + p->syms_cache = kv->v.cache; return (void *)kv->v.cache; } @@ -918,6 +966,8 @@ static inline void java_expired_update(symbol_caches_hash_t * h, struct symbolizer_cache_kvp *kv, struct symbolizer_proc_info *p) { + ASSERT(p != NULL); + /* Update java symbols table, will be executed during * the next Java symbolication */ @@ -948,8 +998,12 @@ void *get_symbol_cache(pid_t pid, bool new_cache) ASSERT(pid >= 0); if (k_resolver == NULL && pid == 0) { - k_resolver = (void *)bcc_symcache_new(-1, &lazy_opt); - sys_btime_msecs = get_sys_btime_msecs(); + symbolizer_kernel_lock(); + if (k_resolver == NULL) { + k_resolver = (void *)bcc_symcache_new(-1, &lazy_opt); + sys_btime_msecs = get_sys_btime_msecs(); + } + symbolizer_kernel_unlock(); } if (!new_cache) @@ -967,6 +1021,7 @@ void *get_symbol_cache(pid_t pid, bool new_cache) if (symbol_caches_hash_search(h, (symbol_caches_hash_kv *) & kv, (symbol_caches_hash_kv *) & kv) == 0) { p = (struct symbolizer_proc_info *)kv.v.proc_info_p; + symbolizer_proc_lock(p); u64 curr_time = current_sys_time_secs(); if (p->verified) { /* @@ -998,7 +1053,8 @@ void *get_symbol_cache(pid_t pid, bool new_cache) * generation of Java symbol tables, additional random value * for each java process's delay. */ - if (kv.v.cache == 0 && !p->gen_java_syms_file_err) { + if (kv.v.cache == 0 + && !p->gen_java_syms_file_err) { p->update_syms_table_time = generate_random_integer (PROFILER_DEFER_RANDOM_MAX); @@ -1009,18 +1065,27 @@ void *get_symbol_cache(pid_t pid, bool new_cache) && curr_time >= p->update_syms_table_time) { if (p->is_java) { java_expired_update(h, &kv, p); + symbolizer_proc_unlock(p); return (void *)kv.v.cache; } else { - return symbols_cache_update(h, &kv, p); + void *ret = + symbols_cache_update(h, &kv, p); + symbolizer_proc_unlock(p); + return ret; } } } else { + symbolizer_proc_unlock(p); /* Ensure that newly launched JAVA processes are detected. */ return NULL; } - if (kv.v.cache) + if (kv.v.cache) { + symbolizer_proc_unlock(p); return (void *)kv.v.cache; + } + + symbolizer_proc_unlock(p); } return NULL; diff --git a/agent/src/ebpf/user/symbol.h b/agent/src/ebpf/user/symbol.h index 9763a694aeb..15413baff5d 100644 --- a/agent/src/ebpf/user/symbol.h +++ b/agent/src/ebpf/user/symbol.h @@ -82,8 +82,23 @@ struct symbolizer_proc_info { char container_id[CONTAINER_ID_SIZE]; /* reference counting */ u64 use; + /* Protect symbolizer_proc_info from concurrent access by multiple threads. */ + volatile u32 lock; + /* Recording symbol resolution cache. 
*/ + volatile uword syms_cache; }; +static inline void symbolizer_proc_lock(struct symbolizer_proc_info *p) +{ + while (__atomic_test_and_set(&p->lock, __ATOMIC_ACQUIRE)) + CLIB_PAUSE(); +} + +static inline void symbolizer_proc_unlock(struct symbolizer_proc_info *p) +{ + __atomic_clear(&p->lock, __ATOMIC_RELEASE); +} + struct symbolizer_cache_kvp { struct { u64 pid; @@ -155,27 +170,28 @@ struct symbol_tracepoint { char *name; }; -static_always_inline u64 -cache_process_stime(struct symbolizer_cache_kvp *kv) +static_always_inline u64 cache_process_stime(struct symbolizer_cache_kvp *kv) { - return (u64)((struct symbolizer_proc_info *)kv->v.proc_info_p)->stime; + return (u64) ((struct symbolizer_proc_info *)kv->v.proc_info_p)->stime; } -static_always_inline u64 -cache_process_netns_id(struct symbolizer_cache_kvp *kv) +static_always_inline u64 cache_process_netns_id(struct symbolizer_cache_kvp *kv) { - return (u64)((struct symbolizer_proc_info *)kv->v.proc_info_p)->netns_id; + return (u64) ((struct symbolizer_proc_info *)kv->v. + proc_info_p)->netns_id; } static_always_inline void copy_process_name(struct symbolizer_cache_kvp *kv, char *dst) { static const int len = - sizeof(((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm); + sizeof(((struct symbolizer_proc_info *) kv->v.proc_info_p)->comm); strcpy_s_inline(dst, len, - ((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm, - strlen(((struct symbolizer_proc_info *)kv->v.proc_info_p)->comm)); + ((struct symbolizer_proc_info *)kv->v. + proc_info_p)->comm, + strlen(((struct symbolizer_proc_info *)kv-> + v.proc_info_p)->comm)); } void free_uprobe_symbol(struct symbol_uprobe *u_sym, @@ -198,6 +214,9 @@ u64 get_pid_stime(pid_t pid); void exec_proc_info_cache_update(void); void set_java_syms_fetch_delay(int delay_secs); u64 get_java_syms_fetch_delay(void); +void free_proc_cache(struct symbolizer_proc_info *p); +void symbolizer_kernel_lock(void); +void symbolizer_kernel_unlock(void); #endif int create_and_init_proc_info_caches(void); void get_container_id_from_procs_cache(pid_t pid, uint8_t * id, int id_size); diff --git a/agent/src/ebpf/user/tracer.c b/agent/src/ebpf/user/tracer.c index 3e0fc0637b1..46db256ed8d 100644 --- a/agent/src/ebpf/user/tracer.c +++ b/agent/src/ebpf/user/tracer.c @@ -218,13 +218,6 @@ int release_bpf_tracer(const char *name) if (t->release_cb != NULL) t->release_cb(t); - /* - * Check if the reader thread has exited before releasing - * the reader lock. - */ - while (t->perf_worker[0] != 0) - sleep(1); - if (t->lock) { clib_mem_free((void *)t->lock); t->lock = NULL; @@ -253,8 +246,8 @@ int enable_tracer_reader_work(const char *prefix_name, int idx, int ret; char name[TASK_COMM_LEN]; snprintf(name, sizeof(name), "%s-%d", prefix_name, idx); - ret = pthread_create(&tracer->perf_worker[idx], NULL, fn, - (void *)(uint64_t)idx); + ret = pthread_create(&tracer->perf_workers[idx], NULL, fn, + (void *)(uint64_t) idx); if (ret) { ebpf_warning("tracer reader(%s), pthread_create " "is error:%s\n", name, strerror(errno)); @@ -262,14 +255,14 @@ int enable_tracer_reader_work(const char *prefix_name, int idx, } /* set thread name */ - pthread_setname_np(tracer->perf_worker[idx], name); + pthread_setname_np(tracer->perf_workers[idx], name); /* * Separating threads is to automatically release * resources after pthread_exit(), without being * blocked or stuck. 
*/ - ret = pthread_detach(tracer->perf_worker[idx]); + ret = pthread_detach(tracer->perf_workers[idx]); if (ret != 0) { ebpf_warning("Error detaching thread, error:%s\n", strerror(errno)); @@ -346,6 +339,7 @@ struct bpf_tracer *setup_bpf_tracer(const char *name, bt->release_cb = free_cb; bt->create_cb = create_cb; bt->sample_freq = sample_freq; + bt->enable_sample = false; bt->dispatch_workers_nr = workers_nr; bt->process_fn = handle; @@ -871,8 +865,8 @@ static int tracepoint_detach(struct tracepoint *tp) int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, int *probes_count) { - int (*probe_fun) (struct probe * p) = NULL; - int (*tracepoint_fun) (struct tracepoint * p) = NULL; + int (*probe_fun)(struct probe * p) = NULL; + int (*tracepoint_fun)(struct tracepoint * p) = NULL; if (type == HOOK_ATTACH) { probe_fun = probe_attach; tracepoint_fun = tracepoint_attach; @@ -959,6 +953,10 @@ int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, } perf_event: + + if (!tracer->enable_sample) + return ETR_OK; + /* * perf event */ @@ -1010,8 +1008,8 @@ int tracer_hooks_process(struct bpf_tracer *tracer, enum tracer_hook_type type, errno = 0; int ret = program__detach_perf_event(tracer->per_cpu_fds, - ARRAY_SIZE(tracer-> - per_cpu_fds)); + ARRAY_SIZE + (tracer->per_cpu_fds)); if (!ret) { ebpf_info ("tracer \"%s\" detach perf event prog successful.\n", @@ -1169,7 +1167,7 @@ static int perf_reader_setup(struct bpf_perf_reader *perf_reader, int thread_nr) spread_id = 0; struct reader_forward_info *fwd_info = - malloc(sizeof(struct reader_forward_info)); + malloc(sizeof(struct reader_forward_info)); if (fwd_info == NULL) { ebpf_error("reader_forward_info malloc() failed.\n"); return ETR_NOMEM; @@ -1181,12 +1179,10 @@ static int perf_reader_setup(struct bpf_perf_reader *perf_reader, int thread_nr) ebpf_info("Perf buffer reader cpu(%d) -> queue(%d)\n", fwd_info->cpu_id, fwd_info->queue_id); - reader = - (struct perf_reader *) + reader = (struct perf_reader *) bpf_open_perf_buffer(perf_reader->raw_cb, perf_reader->lost_cb, - (void *)fwd_info, -1, i, - pages_cnt); + (void *)fwd_info, -1, i, pages_cnt); if (reader == NULL) { ebpf_error("bpf_open_perf_buffer() failed.\n"); return ETR_NORESOURCE; @@ -1436,7 +1432,7 @@ static int tracer_sockopt_set(sockoptid_t opt, const void *conf, size_t size) } static int tracer_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t * outsize) + void **out, size_t *outsize) { *outsize = sizeof(struct bpf_tracer_param_array) + sizeof(struct bpf_tracer_param) * tracers_count; diff --git a/agent/src/ebpf/user/tracer.h b/agent/src/ebpf/user/tracer.h index 3a1eff79ef8..411795ca0eb 100644 --- a/agent/src/ebpf/user/tracer.h +++ b/agent/src/ebpf/user/tracer.h @@ -347,11 +347,12 @@ struct bpf_tracer { */ int per_cpu_fds[MAX_CPU_NR]; int sample_freq; // sample frequency, Hertz. + bool enable_sample; // Enable CPU sampling? /* * Data distribution processing worker, queues */ - pthread_t perf_worker[MAX_CPU_NR]; // Main thread for user-space receiving perf-buffer data + pthread_t perf_workers[MAX_CPU_NR]; // Main thread for user-space receiving perf-buffer data pthread_t dispatch_workers[MAX_CPU_NR]; // Dispatch threads int dispatch_workers_nr; // Number of dispatch threads struct queue queues[MAX_CPU_NR]; // Dispatch queues, each dispatch thread has its corresponding queue. 
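The locks introduced in this patch (profile_regex_lock(), symbolizer_proc_lock(), symbolizer_kernel_lock()) all use the same test-and-set spin lock idiom built on __atomic_test_and_set()/__atomic_clear(). The following is a minimal, self-contained sketch of that idiom for reference only; it is not part of the patch, uses an unsigned char flag instead of the agent's volatile u32 fields, and replaces the agent's CLIB_PAUSE() hint with a bare busy-wait.

#include <stdio.h>

static volatile unsigned char demo_lock;

static inline void demo_spin_lock(volatile unsigned char *l)
{
	/* Spin until __atomic_test_and_set() reports the flag was clear,
	 * i.e. this thread acquired the lock. */
	while (__atomic_test_and_set(l, __ATOMIC_ACQUIRE))
		;	/* the agent calls CLIB_PAUSE() here to ease contention */
}

static inline void demo_spin_unlock(volatile unsigned char *l)
{
	/* Release the lock with release ordering. */
	__atomic_clear(l, __ATOMIC_RELEASE);
}

int main(void)
{
	demo_spin_lock(&demo_lock);
	printf("inside the critical section\n");
	demo_spin_unlock(&demo_lock);
	return 0;
}

The same acquire/release pairing is what protects profiler_regex and symbolizer_proc_info against the reader and updater threads described in the hunks above.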
diff --git a/server/agent_config/example.yaml b/server/agent_config/example.yaml index 7eca5f3d362..196dd40f86c 100644 --- a/server/agent_config/example.yaml +++ b/server/agent_config/example.yaml @@ -1310,6 +1310,15 @@ vtap_group_id: g-xxxxxx ## Note: Only collect IO events with delay exceeding this threshold, the default value is 1ms. #io-event-minimal-duration: 1ms + ## Java compliant update latency time + ## Default: 600s. Range: [5, 3600]s + ## Note: + ## When deepflow-agent finds that an unresolved function name appears in the function call stack + ## of a Java process, it will trigger the regeneration of the symbol file of the process. + ## Because Java utilizes the Just-In-Time (JIT) compilation mechanism, to obtain more symbols for + ## Java processes, the regeneration will be deferred for a period of time. + #java-symbol-file-refresh-defer-interval: 600s + ## on-cpu profile configuration #on-cpu-profile: ## eBPF on-cpu Profile Switch @@ -1334,13 +1343,27 @@ vtap_group_id: g-xxxxxx ## Default: ^deepflow-.* #regex: ^deepflow-.* - ## When deepflow-agent finds that an unresolved function name appears in the function call stack - ## of a Java process, it will trigger the regeneration of the symbol file of the process. - ## Because Java utilizes the Just-In-Time (JIT) compilation mechanism, to obtain more symbols for - ## Java processes, the regeneration will be deferred for a period of time. - ## Default: 600s. Range: [5, 3600]s - ## The unit of measurement used is seconds. - #java-symbol-file-refresh-defer-interval: 600s + ## off-cpu profile configuration + #off-cpu-profile: + ## eBPF off-cpu Profile Switch + ## Default: true + #disabled: true + + ## Sampling process name + ## Default: ^deepflow-.* + #regex: ^deepflow-.* + + ## Configure the minimum blocking event time + ## Default: 1us. Range: [1, 2^32-1)us + ## Note: + ## Scheduler events are still high-frequency events, as their rate may exceed 1 million events + ## per second, so caution should still be exercised. + ## If overhead remains an issue, you can configure the 'minblock' tunable parameter here. + ## If the off-CPU time is less than the value configured in this item, the data will be discarded. + ## If your goal is to trace longer blocking events, increasing this parameter can filter out shorter + ## blocking events, further reducing overhead. Additionally, we will not collect events with a block + ## time exceeding 1 hour. + #minblock: 1us ###################################### ## Agent Running in Standalone Mode ##
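For reference, a usage sketch for the off-cpu keys documented in the example.yaml hunk above (not part of the patch): uncommenting them at the same nesting level as on-cpu-profile with, for example,

  off-cpu-profile:
    disabled: false
    regex: ^deepflow-.*
    minblock: 50us

would enable off-cpu profiling for processes matching the regex and discard blocking events shorter than 50us. The values shown are arbitrary illustrations; only the key names and the microsecond unit come from the hunk above, and any enclosing parent keys of the agent configuration are omitted here.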