Skip to content

Commit b571104

Browse files
committed
perf lock contention: Use per-cpu array map for spinlocks
Currently lock contention timestamp is maintained in a hash map keyed by pid. That means it needs to get and release a map element (which is proctected by spinlock!) on each contention begin and end pair. This can impact on performance if there are a lot of contention (usually from spinlocks). It used to go with task local storage but it had an issue on memory allocation in some critical paths. Although it's addressed in recent kernels IIUC, the tool should support old kernels too. So it cannot simply switch to the task local storage at least for now. As spinlocks create lots of contention and they disabled preemption during the spinning, it can use per-cpu array to keep the timestamp to avoid overhead in hashmap update and delete. In contention_begin, it's easy to check the lock types since it can see the flags. But contention_end cannot see it. So let's try to per-cpu array first (unconditionally) if it has an active element (lock != 0). Then it should be used and per-task tstamp map should not be used until the per-cpu array element is cleared which means nested spinlock contention (if any) was finished and it nows see (the outer) lock. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Acked-by: Ian Rogers <irogers@google.com> Cc: Hao Luo <haoluo@google.com> Cc: Song Liu <song@kernel.org> Cc: bpf@vger.kernel.org Link: https://lore.kernel.org/r/20231020204741.1869520-3-namhyung@kernel.org
1 parent 6a07057 commit b571104

File tree

1 file changed

+72
-17
lines changed

1 file changed

+72
-17
lines changed

tools/perf/util/bpf_skel/lock_contention.bpf.c

Lines changed: 72 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,14 @@ struct {
4242
__uint(max_entries, MAX_ENTRIES);
4343
} tstamp SEC(".maps");
4444

45+
/* maintain per-CPU timestamp at the beginning of contention */
46+
struct {
47+
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
48+
__uint(key_size, sizeof(__u32));
49+
__uint(value_size, sizeof(struct tstamp_data));
50+
__uint(max_entries, 1);
51+
} tstamp_cpu SEC(".maps");
52+
4553
/* actual lock contention statistics */
4654
struct {
4755
__uint(type, BPF_MAP_TYPE_HASH);
@@ -311,34 +319,57 @@ static inline __u32 check_lock_type(__u64 lock, __u32 flags)
311319
return 0;
312320
}
313321

314-
SEC("tp_btf/contention_begin")
315-
int contention_begin(u64 *ctx)
322+
static inline struct tstamp_data *get_tstamp_elem(__u32 flags)
316323
{
317324
__u32 pid;
318325
struct tstamp_data *pelem;
319326

320-
if (!enabled || !can_record(ctx))
321-
return 0;
327+
/* Use per-cpu array map for spinlock and rwlock */
328+
if (flags == (LCB_F_SPIN | LCB_F_READ) || flags == LCB_F_SPIN ||
329+
flags == (LCB_F_SPIN | LCB_F_WRITE)) {
330+
__u32 idx = 0;
331+
332+
pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
333+
/* Do not update the element for nested locks */
334+
if (pelem && pelem->lock)
335+
pelem = NULL;
336+
return pelem;
337+
}
322338

323339
pid = bpf_get_current_pid_tgid();
324340
pelem = bpf_map_lookup_elem(&tstamp, &pid);
341+
/* Do not update the element for nested locks */
325342
if (pelem && pelem->lock)
326-
return 0;
343+
return NULL;
327344

328345
if (pelem == NULL) {
329346
struct tstamp_data zero = {};
330347

331348
if (bpf_map_update_elem(&tstamp, &pid, &zero, BPF_NOEXIST) < 0) {
332349
__sync_fetch_and_add(&task_fail, 1);
333-
return 0;
350+
return NULL;
334351
}
335352

336353
pelem = bpf_map_lookup_elem(&tstamp, &pid);
337354
if (pelem == NULL) {
338355
__sync_fetch_and_add(&task_fail, 1);
339-
return 0;
356+
return NULL;
340357
}
341358
}
359+
return pelem;
360+
}
361+
362+
SEC("tp_btf/contention_begin")
363+
int contention_begin(u64 *ctx)
364+
{
365+
struct tstamp_data *pelem;
366+
367+
if (!enabled || !can_record(ctx))
368+
return 0;
369+
370+
pelem = get_tstamp_elem(ctx[1]);
371+
if (pelem == NULL)
372+
return 0;
342373

343374
pelem->timestamp = bpf_ktime_get_ns();
344375
pelem->lock = (__u64)ctx[0];
@@ -377,24 +408,42 @@ int contention_begin(u64 *ctx)
377408
SEC("tp_btf/contention_end")
378409
int contention_end(u64 *ctx)
379410
{
380-
__u32 pid;
411+
__u32 pid = 0, idx = 0;
381412
struct tstamp_data *pelem;
382413
struct contention_key key = {};
383414
struct contention_data *data;
384415
__u64 duration;
416+
bool need_delete = false;
385417

386418
if (!enabled)
387419
return 0;
388420

389-
pid = bpf_get_current_pid_tgid();
390-
pelem = bpf_map_lookup_elem(&tstamp, &pid);
391-
if (!pelem || pelem->lock != ctx[0])
392-
return 0;
421+
/*
422+
* For spinlock and rwlock, it needs to get the timestamp for the
423+
* per-cpu map. However, contention_end does not have the flags
424+
* so it cannot know whether it reads percpu or hash map.
425+
*
426+
* Try per-cpu map first and check if there's active contention.
427+
* If it is, do not read hash map because it cannot go to sleeping
428+
* locks before releasing the spinning locks.
429+
*/
430+
pelem = bpf_map_lookup_elem(&tstamp_cpu, &idx);
431+
if (pelem && pelem->lock) {
432+
if (pelem->lock != ctx[0])
433+
return 0;
434+
} else {
435+
pid = bpf_get_current_pid_tgid();
436+
pelem = bpf_map_lookup_elem(&tstamp, &pid);
437+
if (!pelem || pelem->lock != ctx[0])
438+
return 0;
439+
need_delete = true;
440+
}
393441

394442
duration = bpf_ktime_get_ns() - pelem->timestamp;
395443
if ((__s64)duration < 0) {
396444
pelem->lock = 0;
397-
bpf_map_delete_elem(&tstamp, &pid);
445+
if (need_delete)
446+
bpf_map_delete_elem(&tstamp, &pid);
398447
__sync_fetch_and_add(&time_fail, 1);
399448
return 0;
400449
}
@@ -406,8 +455,11 @@ int contention_end(u64 *ctx)
406455
case LOCK_AGGR_TASK:
407456
if (lock_owner)
408457
key.pid = pelem->flags;
409-
else
458+
else {
459+
if (!need_delete)
460+
pid = bpf_get_current_pid_tgid();
410461
key.pid = pid;
462+
}
411463
if (needs_callstack)
412464
key.stack_id = pelem->stack_id;
413465
break;
@@ -428,7 +480,8 @@ int contention_end(u64 *ctx)
428480
if (!data) {
429481
if (data_map_full) {
430482
pelem->lock = 0;
431-
bpf_map_delete_elem(&tstamp, &pid);
483+
if (need_delete)
484+
bpf_map_delete_elem(&tstamp, &pid);
432485
__sync_fetch_and_add(&data_fail, 1);
433486
return 0;
434487
}
@@ -452,7 +505,8 @@ int contention_end(u64 *ctx)
452505
__sync_fetch_and_add(&data_fail, 1);
453506
}
454507
pelem->lock = 0;
455-
bpf_map_delete_elem(&tstamp, &pid);
508+
if (need_delete)
509+
bpf_map_delete_elem(&tstamp, &pid);
456510
return 0;
457511
}
458512

@@ -466,7 +520,8 @@ int contention_end(u64 *ctx)
466520
data->min_time = duration;
467521

468522
pelem->lock = 0;
469-
bpf_map_delete_elem(&tstamp, &pid);
523+
if (need_delete)
524+
bpf_map_delete_elem(&tstamp, &pid);
470525
return 0;
471526
}
472527

0 commit comments

Comments
 (0)