Skip to content

Commit ed2f752

Browse files
ubizjak authored and Ingo Molnar committed
x86/percpu: Introduce const-qualified const_pcpu_hot to micro-optimize code generation
Some variables in pcpu_hot, currently current_task and top_of_stack are actually per-thread variables implemented as per-CPU variables and thus stable for the duration of the respective task. There is already an attempt to eliminate redundant reads from these variables using this_cpu_read_stable() asm macro, which hides the dependency on the read memory address. However, the compiler has limited ability to eliminate asm common subexpressions, so this approach results in a limited success. The solution is to allow more aggressive elimination by aliasing pcpu_hot into a const-qualified const_pcpu_hot, and to read stable per-CPU variables from this constant copy. The current per-CPU infrastructure does not support reads from const-qualified variables. However, when the compiler supports segment qualifiers, it is possible to declare the const-aliased variable in the relevant named address space. The compiler considers access to the variable, declared in this way, as a read from a constant location, and will optimize reads from the variable accordingly. By implementing constant-qualified const_pcpu_hot, the compiler can eliminate redundant reads from the constant variables, reducing the number of loads from current_task from 3766 to 3217 on a test build, a -14.6% reduction. The reduction of loads translates to the following code savings: text data bss dec hex filename 25,477,353 4389456 808452 30675261 1d4113d vmlinux-old.o 25,476,074 4389440 808452 30673966 1d40c2e vmlinux-new.o representing a code size reduction of -1279 bytes. [ mingo: Updated the changelog, EXPORT(const_pcpu_hot). ] Co-developed-by: Nadav Amit <namit@vmware.com> Signed-off-by: Nadav Amit <namit@vmware.com> Signed-off-by: Uros Bizjak <ubizjak@gmail.com> Signed-off-by: Ingo Molnar <mingo@kernel.org> Link: https://lore.kernel.org/r/20231020162004.135244-1-ubizjak@gmail.com
1 parent 59bec00 commit ed2f752

File tree

6 files changed

+16
-4
lines changed

6 files changed

+16
-4
lines changed

arch/x86/include/asm/current.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,15 @@ static_assert(sizeof(struct pcpu_hot) == 64);
3636

3737
DECLARE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot);
3838

39+
/* const-qualified alias to pcpu_hot, aliased by linker. */
40+
DECLARE_PER_CPU_ALIGNED(const struct pcpu_hot __percpu_seg_override,
41+
const_pcpu_hot);
42+
3943
static __always_inline struct task_struct *get_current(void)
4044
{
45+
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
46+
return const_pcpu_hot.current_task;
47+
4148
return this_cpu_read_stable(pcpu_hot.current_task);
4249
}
4350

arch/x86/include/asm/percpu.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,9 @@ do { \
413413
* accessed while this_cpu_read_stable() allows the value to be cached.
414414
* this_cpu_read_stable() is more efficient and can be used if its value
415415
* is guaranteed to be valid across cpus. The current users include
416-
* get_current() and get_thread_info() both of which are actually
417-
* per-thread variables implemented as per-cpu variables and thus
418-
* stable for the duration of the respective task.
416+
* pcpu_hot.current_task and pcpu_hot.top_of_stack, both of which are
417+
* actually per-thread variables implemented as per-CPU variables and
418+
* thus stable for the duration of the respective task.
419419
*/
420420
#define this_cpu_read_stable_1(pcp) percpu_stable_op(1, "mov", pcp)
421421
#define this_cpu_read_stable_2(pcp) percpu_stable_op(2, "mov", pcp)

arch/x86/include/asm/processor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,9 @@ static __always_inline unsigned long current_top_of_stack(void)
518518
* and around vm86 mode and sp0 on x86_64 is special because of the
519519
* entry trampoline.
520520
*/
521+
if (IS_ENABLED(CONFIG_USE_X86_SEG_SUPPORT))
522+
return const_pcpu_hot.top_of_stack;
523+
521524
return this_cpu_read_stable(pcpu_hot.top_of_stack);
522525
}
523526

arch/x86/kernel/cpu/common.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2051,6 +2051,7 @@ DEFINE_PER_CPU_ALIGNED(struct pcpu_hot, pcpu_hot) = {
20512051
.top_of_stack = TOP_OF_INIT_STACK,
20522052
};
20532053
EXPORT_PER_CPU_SYMBOL(pcpu_hot);
2054+
EXPORT_PER_CPU_SYMBOL(const_pcpu_hot);
20542055

20552056
#ifdef CONFIG_X86_64
20562057
DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,

arch/x86/kernel/vmlinux.lds.S

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ ENTRY(phys_startup_64)
4646
#endif
4747

4848
jiffies = jiffies_64;
49+
const_pcpu_hot = pcpu_hot;
4950

5051
#if defined(CONFIG_X86_64)
5152
/*

include/linux/compiler.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val,
212212
*/
213213
#define ___ADDRESSABLE(sym, __attrs) \
214214
static void * __used __attrs \
215-
__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)&sym;
215+
__UNIQUE_ID(__PASTE(__addressable_,sym)) = (void *)(uintptr_t)&sym;
216216
#define __ADDRESSABLE(sym) \
217217
___ADDRESSABLE(sym, __section(".discard.addressable"))
218218

0 commit comments

Comments (0)