From c729e69c69bb788494cd6245da4b5fa210c612da Mon Sep 17 00:00:00 2001 From: Jiping Yin Date: Fri, 3 May 2024 16:31:55 +0800 Subject: [PATCH] Modify the interface process_bpf_stacktraces() --- agent/src/ebpf/code.style | 6 + .../ebpf/samples/rust/profiler/src/main.rs | 8 +- agent/src/ebpf/user/profile/perf_profiler.c | 447 ++---------------- agent/src/ebpf/user/profile/perf_profiler.h | 33 +- agent/src/ebpf/user/profile/profile_common.c | 357 +++++++++++++- agent/src/ebpf/user/profile/profile_common.h | 18 + 6 files changed, 432 insertions(+), 437 deletions(-) create mode 100755 agent/src/ebpf/code.style diff --git a/agent/src/ebpf/code.style b/agent/src/ebpf/code.style new file mode 100755 index 00000000000..ceb7791f998 --- /dev/null +++ b/agent/src/ebpf/code.style @@ -0,0 +1,6 @@ +#! /bin/bash + +indent -npro -kr -i8 -ts8 -nss -nsc -ncs -nprs -sob -l80 -ss -cp1 --space-after-for --space-after-if --space-after-while --space-special-semicolon --blank-lines-after-procedures -v $1 +sed -i "s/{ }/{}/g" $1 +sed -i "s/) ;/);/g" $1 +sed -i "s/^ //g" $1 diff --git a/agent/src/ebpf/samples/rust/profiler/src/main.rs b/agent/src/ebpf/samples/rust/profiler/src/main.rs index 95219c49106..4d44c423117 100644 --- a/agent/src/ebpf/samples/rust/profiler/src/main.rs +++ b/agent/src/ebpf/samples/rust/profiler/src/main.rs @@ -201,10 +201,10 @@ fn main() { bpf_tracer_finish(); - //if cpdbg_set_config(60, debug_callback) != 0 { - // println!("cpdbg_set_config() error"); - // ::std::process::exit(1); - //} + if cpdbg_set_config(60, debug_callback) != 0 { + println!("cpdbg_set_config() error"); + ::std::process::exit(1); + } let stats = socket_tracer_stats(); print!("{:#?}\n", stats); diff --git a/agent/src/ebpf/user/profile/perf_profiler.c b/agent/src/ebpf/user/profile/perf_profiler.c index 1f196896eb1..73c244cd1cd 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.c +++ b/agent/src/ebpf/user/profile/perf_profiler.c @@ -64,24 +64,14 @@ extern char linux_release[128]; extern __thread uword thread_index; struct stack_trace_key_t *raw_stack_data; -static u64 stack_trace_lost; struct bpf_tracer *profiler_tracer; -// for stack_trace_msg_hash relese -static __thread stack_trace_msg_hash_kv *trace_msg_kvps; -static __thread bool msg_clear_hash; - // for flame-graph test static FILE *folded_file; #define FOLDED_FILE_PATH "./profiler.folded" static char *flame_graph_start_time; static char *flame_graph_end_time; -/* Record the time of the last data push - * (in seconds since system startup)*/ -static u64 last_push_time; -static u64 push_count; - /* * 'cpu_aggregation_flag' is used to set whether to retrieve CPUID * and include it in the aggregation of stack trace data. @@ -103,15 +93,8 @@ static stack_str_hash_t g_stack_str_hash; */ static stack_trace_msg_hash_t g_msg_hash; -/* - * The iteration count causes BPF to switch buffers with each iteration. - */ -static u64 transfer_count; static u64 process_count; -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h); static void print_cp_tracer_status(struct bpf_tracer *t, struct profiler_context *ctx); @@ -123,18 +106,6 @@ static bool cpdbg_use_remote; static uint32_t cpdbg_start_time; static uint32_t cpdbg_timeout; -/* Record all stack IDs in each iteration for quick retrieval. */ -struct stack_ids_bitmap stack_ids_a; -struct stack_ids_bitmap stack_ids_b; -/* This vector table is used to remove a stack from the stack map. 
*/ -static int *clear_stack_ids_a; -static int *clear_stack_ids_b; -static u64 stackmap_clear_failed_count; - -/* perf buffer queue loss statistics */ -static u64 perf_buf_lost_a_count; -static u64 perf_buf_lost_b_count; - static u64 get_process_lost_count(struct profiler_context *ctx) { return atomic64_read(&ctx->process_lost_count); @@ -258,14 +229,14 @@ static void reader_lost_cb_a(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_a_count++; + oncpu_ctx.perf_buf_lost_a_count++; } static void reader_lost_cb_b(void *cookie, u64 lost) { struct bpf_tracer *tracer = profiler_tracer; atomic64_add(&tracer->lost, lost); - perf_buf_lost_b_count++; + oncpu_ctx.perf_buf_lost_b_count++; } static void reader_raw_cb(void *cookie, void *raw, int raw_size) @@ -312,16 +283,6 @@ static int release_profiler(struct bpf_tracer *tracer) return ETR_OK; } -static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, - const char *name) -{ - memset(h, 0, sizeof(*h)); - u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; - u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; - return stack_trace_msg_hash_init(h, (char *)name, - nbuckets, hash_memory_size); -} - static inline bool is_cpdbg_timeout(void) { uint32_t passed_sec; @@ -372,7 +333,7 @@ static void print_cp_data(stack_trace_msg_t * msg) } } -static void cpdbg_process(stack_trace_msg_t * msg) +void cpdbg_process(stack_trace_msg_t * msg) { pthread_mutex_lock(&cpdbg_mutex); if (unlikely(cpdbg_enable)) { @@ -383,114 +344,26 @@ static void cpdbg_process(stack_trace_msg_t * msg) pthread_mutex_unlock(&cpdbg_mutex); } -static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *arg) -{ - struct profiler_context *ctx = arg; - stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv; - if (msg_kv->msg_ptr != 0) { - stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr; - - /* continuous profiler debug */ - cpdbg_process(msg); - - tracer_callback_t fun = profiler_tracer->process_fn; - /* - * Execute callback function to hand over the data to the - * higher level for processing. The higher level will se- - * nd the data to the server for storage as required. - */ - if (likely(ctx->profiler_stop == 0)) - fun(msg); - - clib_mem_free((void *)msg); - msg_kv->msg_ptr = 0; - } - - int ret = VEC_OK; - vec_add1(trace_msg_kvps, *kv, ret); - if (ret != VEC_OK) { - ebpf_warning("vec add failed\n"); - msg_clear_hash = true; - } - - return BIHASH_WALK_CONTINUE; -} - -/* - * Push the data and release the resources. - * @is_force: Do you need to perform a forced release? - */ -static void push_and_release_stack_trace_msg(struct profiler_context *ctx, - stack_trace_msg_hash_t * h, - bool is_force) -{ - ASSERT(profiler_tracer != NULL); - - u64 curr_time, elapsed; - curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN); - elapsed = curr_time - last_push_time; - /* - * If the aggregated stack trace data obtained by the profiler - * satisfies one of the following conditions, it should be pushed - * to the upper-level processing: - * - * If the time interval since the last push exceeds or equals - * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL). - * - * Otherwise, it should return directly. - */ - if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) - return; - - /* update last push time. 
*/ - last_push_time = curr_time; - push_count++; - - stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, - (void *)ctx); - /* - * In this iteration, all elements will be cleared, and in the - * next iteration, this hash will be reused. - */ - stack_trace_msg_hash_kv *v; - vec_foreach(v, trace_msg_kvps) { - if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { - ebpf_warning - ("stack_trace_msg_hash_add_del() failed.\n"); - msg_clear_hash = true; - } - } - - vec_free(trace_msg_kvps); - - h->hit_hash_count = 0; - h->hash_elems_count = 0; - - if (msg_clear_hash) { - msg_clear_hash = false; - stack_trace_msg_hash_free(h); - } -} - -static inline void add_stack_id_to_bitmap(int stack_id, bool is_a) +static inline void add_stack_id_to_bitmap(struct profiler_context *ctx, + int stack_id, bool is_a) { if (stack_id < 0) return; struct stack_ids_bitmap *ids; if (is_a) - ids = &stack_ids_a; + ids = &ctx->stack_ids_a; else - ids = &stack_ids_b; + ids = &ctx->stack_ids_b; if (!is_set_bitmap(ids->bitmap, stack_id)) { set_bitmap(ids->bitmap, stack_id); int ret = VEC_OK; if (is_a) - vec_add1(clear_stack_ids_a, stack_id, ret); + vec_add1(ctx->clear_stack_ids_a, stack_id, ret); else - vec_add1(clear_stack_ids_b, stack_id, ret); + vec_add1(ctx->clear_stack_ids_b, stack_id, ret); if (ret != VEC_OK) { ebpf_warning("vec add failed\n"); @@ -549,12 +422,12 @@ static inline void update_matched_process_in_total(struct profiler_context *ctx, } } -static void aggregate_stack_traces(struct profiler_context *ctx, - struct bpf_tracer *t, - const char *stack_map_name, - stack_str_hash_t * stack_str_hash, - stack_trace_msg_hash_t * msg_hash, - u32 * count, bool use_a_map) +void aggregate_stack_traces(struct profiler_context *ctx, + struct bpf_tracer *t, + const char *stack_map_name, + stack_str_hash_t * stack_str_hash, + stack_trace_msg_hash_t * msg_hash, + u32 * count, bool use_a_map) { struct stack_trace_key_t *v; vec_foreach(v, raw_stack_data) { @@ -586,13 +459,13 @@ static void aggregate_stack_traces(struct profiler_context *ctx, /* -EEXIST: Hash bucket collision in the stack trace table */ if (v->kernstack == -EEXIST) - stack_trace_lost++; + ctx->stack_trace_err++; if (v->userstack == -EEXIST) - stack_trace_lost++; + ctx->stack_trace_err++; - add_stack_id_to_bitmap(v->kernstack, use_a_map); - add_stack_id_to_bitmap(v->userstack, use_a_map); + add_stack_id_to_bitmap(ctx, v->kernstack, use_a_map); + add_stack_id_to_bitmap(ctx, v->userstack, use_a_map); /* Total iteration count for this iteration. 
*/ (*count)++; @@ -752,238 +625,6 @@ static void aggregate_stack_traces(struct profiler_context *ctx, vec_free(raw_stack_data); } -static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, - const char *stack_map_name) -{ - struct ebpf_map *map = - ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); - if (map == NULL) { - ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, - stack_map_name); - return 0; - } - int map_fd = map->fd; - - u32 key = 0, next_key; - u32 reclaim_count = 0; - u32 find_count = 0; - struct list_head clear_elem_head; - init_list_head(&clear_elem_head); - - while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { - find_count++; - insert_list(&next_key, sizeof(next_key), &clear_elem_head); - key = next_key; - } - - reclaim_count = __reclaim_map(map_fd, &clear_elem_head); - - ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", - __func__, stack_map_name, find_count, reclaim_count); - - return reclaim_count; -} - -static void cleanup_stackmap(struct bpf_tracer *t, - const char *stack_map_name, bool is_a) -{ - struct stack_ids_bitmap *ids; - int *clear_stack_ids; - u64 *perf_buf_lost_p = NULL; - - if (is_a) { - ids = &stack_ids_a; - clear_stack_ids = clear_stack_ids_a; - perf_buf_lost_p = &perf_buf_lost_a_count; - } else { - ids = &stack_ids_b; - clear_stack_ids = clear_stack_ids_b; - perf_buf_lost_p = &perf_buf_lost_b_count; - } - - if (ids->count != vec_len(clear_stack_ids)) { - ebpf_warning - ("stack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", - ids->count, vec_len(clear_stack_ids)); - } - - /* - * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, - * which may lead to out-of-order behavior in a multi-core environment. - * We have employed a threshold to delay the cleanup of the stack map, reducing the - * occurrence of premature clearing of stack entries caused by the disorder in stack - * data. - * - * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in - * 'agent/src/ebpf/user/config.h'. - */ - if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) { - int *sid; - vec_foreach(sid, clear_stack_ids) { - int id = *sid; - if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) { - /* - * It may be due to the disorder in the perf buffer transmission, - * leading to the repetitive deletion of the same stack ID. - */ - stackmap_clear_failed_count++; - } - - clear_bitmap(ids->bitmap, id); - } - - if (is_a) - vec_free(clear_stack_ids_a); - else - vec_free(clear_stack_ids_b); - - ids->count = 0; - - /* - * If data loss occurs due to the user-space receiver program - * being too busy and not promptly fetching data from the perf - * buffer, it is necessary to clean the stack map once to prevent - * excessive remnants of stack data from affecting the acquisition - * of new stack data (i.e., eBPF using the bpf_get_stackid() - * interface will return -EEXIST). - */ - if (*perf_buf_lost_p > 0) { - delete_all_stackmap_elems(t, stack_map_name); - *perf_buf_lost_p = 0; - } - } -} - -static void process_bpf_stacktraces(struct profiler_context *ctx, - struct bpf_tracer *t, - struct bpf_perf_reader *r_a, - struct bpf_perf_reader *r_b) -{ - struct bpf_perf_reader *r; - const char *stack_map_name; - bool using_map_set_a = (transfer_count % 2 == 0); - r = using_map_set_a ? r_a : r_b; - stack_map_name = using_map_set_a ? MAP_STACK_A_NAME : MAP_STACK_B_NAME; - const u64 sample_count_idx = - using_map_set_a ? 
SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; - - struct epoll_event events[r->readers_count]; - int nfds = reader_epoll_wait(r, events, 0); - - transfer_count++; - /* update map MAP_PROFILER_STATE_MAP */ - if (bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - TRANSFER_CNT_IDX, &transfer_count) == false) { - ebpf_warning("profiler state map update error." - "(%s transfer_count %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - transfer_count, strerror(errno)); - transfer_count--; - } - - /* Total iteration count for this iteration. */ - u32 count = 0; - - /* eBPF map record count for this iteration. */ - u64 sample_cnt_val = 0; - - /* - * Why use g_stack_str_hash? - * - * When the stringizer encounters a stack-ID for the first time in - * the stack trace table, it clears it. If a stack-ID is reused by - * different stack trace keys, the stringizer returns its memoized - * stack trace string. Since stack IDs are unstable between profile - * iterations, we create and destroy the stringizer in each profile - * iteration. - */ - if (unlikely(g_stack_str_hash.buckets == NULL)) { - if (init_stack_str_hash(&g_stack_str_hash, "profile_stack_str")) { - ebpf_warning("init_stack_str_hash() failed.\n"); - return; - } - } - - /* - * During each transmission iteration, we have a hashmap structure in - * place for the following purposes: - * - * 1 Pushing the data of this iteration to the higher-level processing. - * 2 Performing data statistics based on the stack trace data, using the - * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + - * u_stack_id + " as the key. - * - * Here is the key-value pair structure of the hashmap: - * see perf_profiler.h (stack_trace_msg_kv_t) - * This is the final form of the data. If the current stack trace message - * is a match, we only need to increment the count field in the correspon- - * ding value, thus avoiding duplicate parsing. - */ - if (unlikely(g_msg_hash.buckets == NULL)) { - if (init_stack_trace_msg_hash(&g_msg_hash, "stack_trace_msg")) { - ebpf_warning("init_stack_trace_msg_hash() failed.\n"); - return; - } - } - - if (nfds > 0) { - - check_again: - if (unlikely(ctx->profiler_stop == 1)) - goto release_iter; - - /* - * If there is data, the reader's callback - * function will be called. - */ - reader_event_read(events, nfds); - - /* - * After the reader completes data reading, the work of - * data aggregation will be blocked if there is no data. - */ - aggregate_stack_traces(ctx, t, stack_map_name, - &g_stack_str_hash, &g_msg_hash, &count, - using_map_set_a); - - /* - * To ensure that all data in the perf ring-buffer is procenssed - * in this iteration, as this iteration will clean up all the - * data recorded in the stackmap, any residual data in the perf - * ring-buffer will be carried over to the next iteration for - * processing. This poses a risk of not being able to find the - * corresponding stackmap records in the next iteration, leading - * to incomplete processing. - */ - if (bpf_table_get_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, - (void *)&sample_cnt_val)) { - if (sample_cnt_val > count) { - nfds = reader_epoll_short_wait(r, events, 0); - if (nfds > 0) - goto check_again; - } - } - } - -release_iter: - - cleanup_stackmap(t, stack_map_name, using_map_set_a); - - /* Now that we've consumed the data, reset the sample count in BPF. 
*/ - sample_cnt_val = 0; - bpf_table_set_value(t, MAP_PROFILER_STATE_MAP, - sample_count_idx, &sample_cnt_val); - - print_profiler_status(t, count, &g_stack_str_hash, &g_msg_hash); - - /* free all elems */ - clean_stack_strs(&g_stack_str_hash); - - /* Push messages and free stack_trace_msg_hash */ - push_and_release_stack_trace_msg(ctx, &g_msg_hash, false); -} - static void java_syms_update_work(void *arg) { java_syms_update_main(arg); @@ -993,9 +634,8 @@ static void oncpu_reader_work(void *arg) { thread_index = THREAD_PROFILER_READER_IDX; struct bpf_tracer *t = profiler_tracer; - struct bpf_perf_reader *reader_a, *reader_b; - reader_a = &t->readers[0]; - reader_b = &t->readers[1]; + oncpu_ctx.r_a = &t->readers[0]; + oncpu_ctx.r_b = &t->readers[1]; for (;;) { if (unlikely(oncpu_ctx.profiler_stop == 1)) { @@ -1028,7 +668,7 @@ static void oncpu_reader_work(void *arg) set_enable_profiler(t, &oncpu_ctx, 1); tracer_reader_lock(t); - process_bpf_stacktraces(&oncpu_ctx, t, reader_a, reader_b); + process_bpf_stacktraces(&oncpu_ctx, t); tracer_reader_unlock(t); } @@ -1197,7 +837,7 @@ static void print_cp_tracer_status(struct bpf_tracer *t, "perf_buf_lost_b:\t%lu process_lost_count:\t%lu " "stack_table_data_miss:\t%lu\n" "stackmap_clear_failed_count\t%lu\n" - "stack_trace_lost:\t%lu\ntransfer_count:\t%lu " + "stack_trace_err:\t%lu\ntransfer_count:\t%lu " "iter_count_avg:\t%.2lf\nalloc_b:\t%lu bytes " "free_b:\t%lu bytes use:\t%lu bytes\n" "eBPF map status:\n" @@ -1207,39 +847,20 @@ static void print_cp_tracer_status(struct bpf_tracer *t, " - iter_max_cnt:\t%lu\n" "----------------------------\n\n", atomic64_read(&t->recv), process_count, - atomic64_read(&t->lost), perf_buf_lost_a_count, - perf_buf_lost_b_count, perf_buf_lost_a_count, - perf_buf_lost_b_count, get_process_lost_count(ctx), + atomic64_read(&t->lost), ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, ctx->perf_buf_lost_a_count, + ctx->perf_buf_lost_b_count, get_process_lost_count(ctx), get_stack_table_data_miss_count(), - stackmap_clear_failed_count, stack_trace_lost, transfer_count, - ((double)atomic64_read(&t->recv) / (double)transfer_count), - alloc_b, free_b, alloc_b - free_b, output_count, - sample_drop_cnt, output_err_cnt, iter_max_cnt); -} - -static void print_profiler_status(struct bpf_tracer *t, u64 iter_count, - stack_str_hash_t * h, - stack_trace_msg_hash_t * msg_h) -{ - u64 alloc_b, free_b; - get_mem_stat(&alloc_b, &free_b); - ebpf_debug("\n\n----------------------------\nrecv envent:\t%lu\n" - "kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n" - "stack_trace_lost:\t%lu\n" - "stackmap_clear_failed_count\t%lu\n" - "ransfer_count:\t%lu iter_count:\t%lu\nall" - "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n" - "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n", - atomic64_read(&t->recv), atomic64_read(&t->lost), - perf_buf_lost_a_count, perf_buf_lost_b_count, - stack_trace_lost, stackmap_clear_failed_count, - transfer_count, iter_count, - alloc_b, free_b, alloc_b - free_b, - h->hit_hash_count, msg_h->hit_hash_count); + ctx->stackmap_clear_failed_count, ctx->stack_trace_err, + ctx->transfer_count, + ((double)atomic64_read(&t->recv) / + (double)ctx->transfer_count), alloc_b, free_b, + alloc_b - free_b, output_count, sample_drop_cnt, + output_err_cnt, iter_max_cnt); } static int cpdbg_sockopt_get(sockoptid_t opt, const void *conf, size_t size, - void **out, size_t * outsize) + void **out, size_t *outsize) { return 0; } @@ -1413,7 +1034,7 @@ void release_flame_graph_hash(void) "<<< 
stack_count %lu add_count %lu hit_count %lu msg_ptr_zero" "_count %lu push_count %lu >>>\n", stack_count, test_add_count, test_hit_count, msg_ptr_zero_count, - push_count); + oncpu_ctx.push_count); ebpf_info(LOG_CP_TAG "Please use the following command to generate a flame graph:" diff --git a/agent/src/ebpf/user/profile/perf_profiler.h b/agent/src/ebpf/user/profile/perf_profiler.h index 2614e06db0c..9432a104c5d 100644 --- a/agent/src/ebpf/user/profile/perf_profiler.h +++ b/agent/src/ebpf/user/profile/perf_profiler.h @@ -17,7 +17,7 @@ #ifndef DF_USER_PERF_PROFILER_H #define DF_USER_PERF_PROFILER_H #define CP_PROFILE_SET_PROBES -#include "offcpu.h" // identoffcpu +#include "offcpu.h" // identoffcpu #include "../bihash_24_8.h" #include "../../kernel/include/perf_profiler.h" @@ -51,22 +51,20 @@ typedef struct { union { struct { /* - * tgid:(max 67,108,864) - * The tgid (Thread Group ID) in kernel space - * is equivalent to the process ID in user space. - * pid:(max 67,108,864) - * The process ID or thread ID in kernel space. - * cpu: (max 4,096) - * Which CPU core does the perf event occur on? - */ - u64 tgid: 26, - pid: 26, - cpu: 12; + * tgid:(max 67,108,864) + * The tgid (Thread Group ID) in kernel space + * is equivalent to the process ID in user space. + * pid:(max 67,108,864) + * The process ID or thread ID in kernel space. + * cpu: (max 4,096) + * Which CPU core does the perf event occur on? + */ + u64 tgid:26, pid:26, cpu:12; /* * process start time(the number of millisecond * elapsed since January 1, 1970 00:00:00). - */ + */ u64 stime; u32 u_stack_id; u32 k_stack_id; @@ -75,15 +73,13 @@ typedef struct { /* Matching and combining for process/thread name. */ struct { u8 comm[TASK_COMM_LEN]; - u64 pid: 26, - reserved: 26, - cpu: 12; + u64 pid:26, reserved:26, cpu:12; } c_k; }; /* Store perf profiler data */ uword msg_ptr; -} stack_trace_msg_kv_t; +} stack_trace_msg_kv_t; /* * stack trace message value, push data @@ -153,10 +149,11 @@ int stop_continuous_profiler(void); int start_continuous_profiler(int freq, int java_syms_space_limit, int java_syms_update_delay, tracer_callback_t callback); -void process_stack_trace_data_for_flame_graph(stack_trace_msg_t *val); +void process_stack_trace_data_for_flame_graph(stack_trace_msg_t * val); void release_flame_graph_hash(void); int set_profiler_regex(const char *pattern); int set_profiler_cpu_aggregation(int flag); struct bpf_tracer *get_profiler_tracer(void); void set_enable_perf_sample(struct bpf_tracer *t, u64 enable_flag); +void cpdbg_process(stack_trace_msg_t * msg); #endif /* DF_USER_PERF_PROFILER_H */ diff --git a/agent/src/ebpf/user/profile/profile_common.c b/agent/src/ebpf/user/profile/profile_common.c index 7f3b944d6b1..7880085d6e0 100644 --- a/agent/src/ebpf/user/profile/profile_common.c +++ b/agent/src/ebpf/user/profile/profile_common.c @@ -64,6 +64,8 @@ /* use for java symbols generate */ #include "deepflow_jattach_bin.c" +extern struct bpf_tracer *profiler_tracer; + extern int major, minor; static bool java_installed; @@ -94,8 +96,7 @@ void set_enable_profiler(struct bpf_tracer *t, struct profiler_context *ctx, ENABLE_IDX, &enable_flag) == false) { ebpf_warning("profiler state map update error." 
"(%s enable_flag %lu) - %s\n", - MAP_PROFILER_STATE_MAP, - enable_flag, strerror(errno)); + ctx->state_map_name, enable_flag, strerror(errno)); } ctx->enable_bpf_profile = enable_flag; @@ -251,3 +252,355 @@ int java_libs_and_tools_install(void) return (0); } + +static u32 delete_all_stackmap_elems(struct bpf_tracer *tracer, + const char *stack_map_name) +{ + struct ebpf_map *map = + ebpf_obj__get_map_by_name(tracer->obj, stack_map_name); + if (map == NULL) { + ebpf_warning("[%s] map(name:%s) is NULL.\n", __func__, + stack_map_name); + return 0; + } + int map_fd = map->fd; + + u32 key = 0, next_key; + u32 reclaim_count = 0; + u32 find_count = 0; + struct list_head clear_elem_head; + init_list_head(&clear_elem_head); + + while (bpf_get_next_key(map_fd, &key, &next_key) == 0) { + find_count++; + insert_list(&next_key, sizeof(next_key), &clear_elem_head); + key = next_key; + } + + reclaim_count = __reclaim_map(map_fd, &clear_elem_head); + + ebpf_info("[%s] table %s find_count %u reclaim_count :%u\n", + __func__, stack_map_name, find_count, reclaim_count); + + return reclaim_count; +} + +static void cleanup_stackmap(struct profiler_context *ctx, struct bpf_tracer *t, + const char *stack_map_name, bool is_a) +{ + struct stack_ids_bitmap *ids; + int *clear_stack_ids; + u64 *perf_buf_lost_p = NULL; + + if (is_a) { + ids = &ctx->stack_ids_a; + clear_stack_ids = ctx->clear_stack_ids_a; + perf_buf_lost_p = &ctx->perf_buf_lost_a_count; + } else { + ids = &ctx->stack_ids_b; + clear_stack_ids = ctx->clear_stack_ids_b; + perf_buf_lost_p = &ctx->perf_buf_lost_b_count; + } + + if (ids->count != vec_len(clear_stack_ids)) { + ebpf_warning + ("stack_ids.count(%lu) != vec_len(clear_stack_ids)(%d)", + ids->count, vec_len(clear_stack_ids)); + } + + /* + * The perf profiler utilizes a perf buffer (per CPUs) for transporting stack data, + * which may lead to out-of-order behavior in a multi-core environment. + * We have employed a threshold to delay the cleanup of the stack map, reducing the + * occurrence of premature clearing of stack entries caused by the disorder in stack + * data. + * + * Examine the detailed explanation of 'STACKMAP_CLEANUP_THRESHOLD' in + * 'agent/src/ebpf/user/config.h'. + */ + if (ids->count >= STACKMAP_CLEANUP_THRESHOLD) { + int *sid; + vec_foreach(sid, clear_stack_ids) { + int id = *sid; + if (!bpf_table_delete_key(t, stack_map_name, (u64) id)) { + /* + * It may be due to the disorder in the perf buffer transmission, + * leading to the repetitive deletion of the same stack ID. + */ + ctx->stackmap_clear_failed_count++; + } + + clear_bitmap(ids->bitmap, id); + } + + if (is_a) + vec_free(ctx->clear_stack_ids_a); + else + vec_free(ctx->clear_stack_ids_b); + + ids->count = 0; + + /* + * If data loss occurs due to the user-space receiver program + * being too busy and not promptly fetching data from the perf + * buffer, it is necessary to clean the stack map once to prevent + * excessive remnants of stack data from affecting the acquisition + * of new stack data (i.e., eBPF using the bpf_get_stackid() + * interface will return -EEXIST). 
+ */ + if (*perf_buf_lost_p > 0) { + delete_all_stackmap_elems(t, stack_map_name); + *perf_buf_lost_p = 0; + } + } +} + +static void print_profiler_status(struct profiler_context *ctx, + struct bpf_tracer *t, u64 iter_count) +{ + u64 alloc_b, free_b; + get_mem_stat(&alloc_b, &free_b); + ebpf_debug("\n\n----------------------------\nrecv envent:\t%lu\n" + "kern_lost:\t%lu, perf_buf_lost_a:\t%lu, perf_buf_lost_b:\t%lu\n" + "stack_trace_err:\t%lu\n" + "stackmap_clear_failed_count\t%lu\n" + "ransfer_count:\t%lu iter_count:\t%lu\nall" + "oc_b:\t%lu bytes free_b:\t%lu bytes use:\t%lu bytes\n" + "stack_str_hash.hit_count %lu\nstack_trace_msg_hash hit %lu\n", + atomic64_read(&t->recv), atomic64_read(&t->lost), + ctx->perf_buf_lost_a_count, ctx->perf_buf_lost_b_count, + ctx->stack_trace_err, ctx->stackmap_clear_failed_count, + ctx->transfer_count, iter_count, + alloc_b, free_b, alloc_b - free_b, + ctx->stack_str_hash.hit_hash_count, + ctx->msg_hash.hit_hash_count); +} + +static int push_and_free_msg_kvp_cb(stack_trace_msg_hash_kv * kv, void *arg) +{ + struct profiler_context *ctx = arg; + stack_trace_msg_kv_t *msg_kv = (stack_trace_msg_kv_t *) kv; + if (msg_kv->msg_ptr != 0) { + stack_trace_msg_t *msg = (stack_trace_msg_t *) msg_kv->msg_ptr; + + /* continuous profiler debug */ + cpdbg_process(msg); + + tracer_callback_t fun = profiler_tracer->process_fn; + /* + * Execute callback function to hand over the data to the + * higher level for processing. The higher level will se- + * nd the data to the server for storage as required. + */ + if (likely(ctx->profiler_stop == 0)) + fun(msg); + + clib_mem_free((void *)msg); + msg_kv->msg_ptr = 0; + } + + int ret = VEC_OK; + vec_add1(ctx->trace_msg_kvps, *kv, ret); + if (ret != VEC_OK) { + ebpf_warning("vec add failed\n"); + ctx->msg_clear_hash = true; + } + + return BIHASH_WALK_CONTINUE; +} + +/* + * Push the data and release the resources. + * @is_force: Do you need to perform a forced release? + */ +void push_and_release_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_hash_t * h, bool is_force) +{ + ASSERT(profiler_tracer != NULL); + + u64 curr_time, elapsed; + curr_time = gettime(CLOCK_MONOTONIC, TIME_TYPE_NAN); + elapsed = curr_time - ctx->last_push_time; + + /* + * If the aggregated stack trace data obtained by the profiler + * satisfies one of the following conditions, it should be pushed + * to the upper-level processing: + * + * If the time interval since the last push exceeds or equals + * the maximum time interval (MAX_PUSH_MSG_TIME_INTERVAL). + * + * Otherwise, it should return directly. + */ + if (!((elapsed >= MAX_PUSH_MSG_TIME_INTERVAL) || is_force)) + return; + + /* update last push time. */ + ctx->last_push_time = curr_time; + ctx->push_count++; + + stack_trace_msg_hash_foreach_key_value_pair(h, push_and_free_msg_kvp_cb, + (void *)ctx); + /* + * In this iteration, all elements will be cleared, and in the + * next iteration, this hash will be reused. 
+ */ + stack_trace_msg_hash_kv *v; + vec_foreach(v, ctx->trace_msg_kvps) { + if (stack_trace_msg_hash_add_del(h, v, 0 /* delete */ )) { + ebpf_warning + ("stack_trace_msg_hash_add_del() failed.\n"); + ctx->msg_clear_hash = true; + } + } + + vec_free(ctx->trace_msg_kvps); + + h->hit_hash_count = 0; + h->hash_elems_count = 0; + + if (ctx->msg_clear_hash) { + ctx->msg_clear_hash = false; + stack_trace_msg_hash_free(h); + } +} + +static int init_stack_trace_msg_hash(stack_trace_msg_hash_t * h, + const char *name) +{ + memset(h, 0, sizeof(*h)); + u32 nbuckets = STACK_TRACE_MSG_HASH_BUCKETS_NUM; + u64 hash_memory_size = STACK_TRACE_MSG_HASH_MEM_SZ; + return stack_trace_msg_hash_init(h, (char *)name, + nbuckets, hash_memory_size); +} + +void process_bpf_stacktraces(struct profiler_context *ctx, struct bpf_tracer *t) +{ + struct bpf_perf_reader *r; + const char *stack_map_name; + bool using_map_set_a = (ctx->transfer_count % 2 == 0); + r = using_map_set_a ? ctx->r_a : ctx->r_b; + stack_map_name = + using_map_set_a ? ctx->stack_map_name_a : ctx->stack_map_name_b; + const u64 sample_count_idx = + using_map_set_a ? SAMPLE_CNT_A_IDX : SAMPLE_CNT_B_IDX; + + struct epoll_event events[r->readers_count]; + int nfds = reader_epoll_wait(r, events, 0); + + ctx->transfer_count++; + if (bpf_table_set_value(t, ctx->state_map_name, + TRANSFER_CNT_IDX, + &ctx->transfer_count) == false) { + ebpf_warning("profiler state map update error." + "(%s transfer_count %lu) - %s\n", + ctx->state_map_name, ctx->transfer_count, + strerror(errno)); + ctx->transfer_count--; + } + + /* Total iteration count for this iteration. */ + u32 count = 0; + + /* eBPF map record count for this iteration. */ + u64 sample_cnt_val = 0; + + /* + * Why use g_stack_str_hash? + * + * When the stringizer encounters a stack-ID for the first time in + * the stack trace table, it clears it. If a stack-ID is reused by + * different stack trace keys, the stringizer returns its memoized + * stack trace string. Since stack IDs are unstable between profile + * iterations, we create and destroy the stringizer in each profile + * iteration. + */ + if (unlikely(ctx->stack_str_hash.buckets == NULL)) { + if (init_stack_str_hash + (&ctx->stack_str_hash, "profile_stack_str")) { + ebpf_warning("init_stack_str_hash() failed.\n"); + return; + } + } + + /* + * During each transmission iteration, we have a hashmap structure in + * place for the following purposes: + * + * 1 Pushing the data of this iteration to the higher-level processing. + * 2 Performing data statistics based on the stack trace data, using the + * combination of "tgid + tgid_start_time + pid + cpu + k_stack_id + + * u_stack_id + " as the key. + * + * Here is the key-value pair structure of the hashmap: + * see perf_profiler.h (stack_trace_msg_kv_t) + * This is the final form of the data. If the current stack trace message + * is a match, we only need to increment the count field in the correspon- + * ding value, thus avoiding duplicate parsing. + */ + if (unlikely(ctx->msg_hash.buckets == NULL)) { + if (init_stack_trace_msg_hash + (&ctx->msg_hash, "stack_trace_msg")) { + ebpf_warning("init_stack_trace_msg_hash() failed.\n"); + return; + } + } + + if (nfds > 0) { + + check_again: + if (unlikely(ctx->profiler_stop == 1)) + goto release_iter; + + /* + * If there is data, the reader's callback + * function will be called. + */ + reader_event_read(events, nfds); + + /* + * After the reader completes data reading, the work of + * data aggregation will be blocked if there is no data. 
+ */ + aggregate_stack_traces(ctx, t, stack_map_name, + &ctx->stack_str_hash, &ctx->msg_hash, + &count, using_map_set_a); + + /* + * To ensure that all data in the perf ring-buffer is procenssed + * in this iteration, as this iteration will clean up all the + * data recorded in the stackmap, any residual data in the perf + * ring-buffer will be carried over to the next iteration for + * processing. This poses a risk of not being able to find the + * corresponding stackmap records in the next iteration, leading + * to incomplete processing. + */ + if (bpf_table_get_value(t, ctx->state_map_name, + sample_count_idx, + (void *)&sample_cnt_val)) { + if (sample_cnt_val > count) { + nfds = reader_epoll_short_wait(r, events, 0); + if (nfds > 0) + goto check_again; + } + } + } + +release_iter: + + cleanup_stackmap(ctx, t, stack_map_name, using_map_set_a); + + /* Now that we've consumed the data, reset the sample count in BPF. */ + sample_cnt_val = 0; + bpf_table_set_value(t, ctx->state_map_name, + sample_count_idx, &sample_cnt_val); + + print_profiler_status(ctx, t, count); + + /* free all elems */ + clean_stack_strs(&ctx->stack_str_hash); + + /* Push messages and free stack_trace_msg_hash */ + push_and_release_stack_trace_msg(ctx, &ctx->msg_hash, false); +} diff --git a/agent/src/ebpf/user/profile/profile_common.h b/agent/src/ebpf/user/profile/profile_common.h index 6a8ce05439a..cda1b6421e1 100644 --- a/agent/src/ebpf/user/profile/profile_common.h +++ b/agent/src/ebpf/user/profile/profile_common.h @@ -85,6 +85,10 @@ struct profiler_context { int *clear_stack_ids_a; int *clear_stack_ids_b; + // for stack_trace_msg_hash relese + stack_trace_msg_hash_kv *trace_msg_kvps; + bool msg_clear_hash; + /* profiler statistics */ // Switching between dual buffers. @@ -101,8 +105,19 @@ struct profiler_context { * is used to count the number of lost processes during the parsing process. */ atomic64_t process_lost_count; + // Stack error quantity statistics obtained by eBPF. + u64 stack_trace_err; + // Quantity statistics of data pushed. + u64 push_count; + /* + * Record the time of the last data push + * (in seconds since system startup) + */ + u64 last_push_time; }; +void process_bpf_stacktraces(struct profiler_context *ctx, + struct bpf_tracer *t); int do_profiler_regex_config(const char *pattern, struct profiler_context *ctx); void set_enable_profiler(struct bpf_tracer *t, struct profiler_context *ctx, u64 enable_flag); @@ -112,4 +127,7 @@ int profiler_context_init(struct profiler_context *ctx, const char *stack_map_name_b); bool run_conditions_check(void); int java_libs_and_tools_install(void); +void push_and_release_stack_trace_msg(struct profiler_context *ctx, + stack_trace_msg_hash_t * h, + bool is_force); #endif /*DF_USER_PROFILE_COMMON_H */