Commit 8c9b6a8

x86: improve on the non-rep 'clear_user' function
The old version was oddly written to have the repeat count in multiple registers. So instead of taking advantage of %rax being zero, it had some sub-counts in it. All just for a "single word clearing" loop, which isn't even efficient to begin with.

So get rid of those games, and just keep all the state in the same registers we got it in (and that we should return things in). That not only makes this act much more like 'rep stos' (which this function is replacing), but makes it much easier to actually do the obvious loop unrolling.

Also rename the function from the now nonsensical 'clear_user_original' to what it now clearly is: 'rep_stos_alternative'.

End result: if we don't have a fast 'rep stosb', at least we can have a fast fallback for it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 577e6a7 commit 8c9b6a8
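
For readers who do not want to trace the assembly in the diff below, here is a rough C sketch of the clearing order the new fallback uses: 64 bytes at a time via eight unrolled qword stores, then single qword stores, then a byte tail. This is only an illustration of the approach described in the commit message; the real routine is x86 assembly, stores through a user pointer, and recovers from faults through exception tables, none of which is modelled here.

/*
 * Illustrative sketch only: the clearing order used by rep_stos_alternative,
 * rendered in C.  The memcpy() of a zero qword stands in for the asm's plain
 * movq stores (which simply tolerate unaligned addresses on x86); fault
 * handling is omitted entirely.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static size_t clear_bytes_sketch(unsigned char *p, size_t count)
{
	uint64_t zero = 0;

	while (count >= 64) {		/* .Lunrolled: eight qword stores per pass */
		for (int i = 0; i < 8; i++)
			memcpy(p + 8 * i, &zero, 8);
		p += 64;
		count -= 64;
	}
	while (count >= 8) {		/* .Lword: one qword at a time */
		memcpy(p, &zero, 8);
		p += 8;
		count -= 8;
	}
	while (count) {			/* .Lclear_user_tail: byte tail */
		*p++ = 0;
		count--;
	}
	return count;			/* 0 == fully cleared (what %rcx returns) */
}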

File tree

3 files changed: +73 −47 lines

arch/x86/include/asm/uaccess_64.h

Lines changed: 2 additions & 2 deletions

@@ -83,7 +83,7 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
  */
 
 __must_check unsigned long
-clear_user_original(void __user *addr, unsigned long len);
+rep_stos_alternative(void __user *addr, unsigned long len);
 
 static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
 {
@@ -97,7 +97,7 @@ static __always_inline __must_check unsigned long __clear_user(void __user *addr
 	asm volatile(
 		"1:\n\t"
 		ALTERNATIVE("rep stosb",
-			    "call clear_user_original", ALT_NOT(X86_FEATURE_FSRS))
+			    "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS))
 		"2:\n"
 	       _ASM_EXTABLE_UA(1b, 2b)
 	       : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
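
The output constraints here are what let the fallback keep its state in place: the byte count arrives in %rcx ("+c" (size)), the destination in %rdi ("+D" (addr)), and the inline asm is set up with %rax already zero (the "rax is zero" input documented in the new assembly below), which is exactly the register contract of 'rep stosb' itself. As a minimal, hypothetical caller-side sketch (not part of this commit) of how the "uncleared bytes" return value is normally consumed:

/*
 * Hypothetical usage sketch, not part of this commit: how a caller
 * typically consumes the "uncleared bytes" count that ends up in
 * %rcx / 'size'.
 */
#include <linux/uaccess.h>
#include <linux/errno.h>

static int zero_user_buffer(void __user *buf, unsigned long len)
{
	unsigned long left = clear_user(buf, len);	/* bytes NOT cleared */

	if (left)
		return -EFAULT;		/* faulted partway through the range */
	return 0;
}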

arch/x86/lib/clear_page_64.S

Lines changed: 70 additions & 44 deletions

@@ -57,59 +57,85 @@ EXPORT_SYMBOL_GPL(clear_page_erms)
  * Input:
  * rdi destination
  * rcx count
+ * rax is zero
  *
  * Output:
  * rcx: uncleared bytes or 0 if successful.
  */
-SYM_FUNC_START(clear_user_original)
-	/*
-	 * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
-	 * i.e., no need for a 'q' suffix and thus a REX prefix.
-	 */
-	mov %ecx,%eax
-	shr $3,%rcx
-	jz .Lrest_bytes
+SYM_FUNC_START(rep_stos_alternative)
+	cmpq $64,%rcx
+	jae .Lunrolled
 
-	# do the qwords first
-	.p2align 4
-.Lqwords:
-	movq $0,(%rdi)
-	lea 8(%rdi),%rdi
-	dec %rcx
-	jnz .Lqwords
+	cmp $8,%ecx
+	jae .Lword
 
-.Lrest_bytes:
-	and $7, %eax
-	jz .Lexit
+	testl %ecx,%ecx
+	je .Lexit
 
-	# now do the rest bytes
-.Lbytes:
-	movb $0,(%rdi)
+.Lclear_user_tail:
+0:	movb %al,(%rdi)
 	inc %rdi
-	dec %eax
-	jnz .Lbytes
-
+	dec %rcx
+	jnz .Lclear_user_tail
 .Lexit:
-	/*
-	 * %rax still needs to be cleared in the exception case because this function is called
-	 * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
-	 * in case it might reuse it somewhere.
-	 */
-	xor %eax,%eax
-	RET
+	RET
 
-.Lqwords_exception:
-	# convert remaining qwords back into bytes to return to caller
-	shl $3, %rcx
-	and $7, %eax
-	add %rax,%rcx
-	jmp .Lexit
+	_ASM_EXTABLE_UA( 0b, .Lexit)
 
-.Lbytes_exception:
-	mov %eax,%ecx
-	jmp .Lexit
+.Lword:
+1:	movq %rax,(%rdi)
+	addq $8,%rdi
+	sub $8,%ecx
+	je .Lexit
+	cmp $8,%ecx
+	jae .Lword
+	jmp .Lclear_user_tail
 
-	_ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
-	_ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
-SYM_FUNC_END(clear_user_original)
-EXPORT_SYMBOL(clear_user_original)
+	.p2align 4
+.Lunrolled:
+10:	movq %rax,(%rdi)
+11:	movq %rax,8(%rdi)
+12:	movq %rax,16(%rdi)
+13:	movq %rax,24(%rdi)
+14:	movq %rax,32(%rdi)
+15:	movq %rax,40(%rdi)
+16:	movq %rax,48(%rdi)
+17:	movq %rax,56(%rdi)
+	addq $64,%rdi
+	subq $64,%rcx
+	cmpq $64,%rcx
+	jae .Lunrolled
+	cmpl $8,%ecx
+	jae .Lword
+	testl %ecx,%ecx
+	jne .Lclear_user_tail
+	RET
+
+	/*
+	 * If we take an exception on any of the
+	 * word stores, we know that %rcx isn't zero,
+	 * so we can just go to the tail clearing to
+	 * get the exact count.
+	 *
+	 * The unrolled case might end up clearing
+	 * some bytes twice. Don't care.
+	 *
+	 * We could use the value in %rdi to avoid
+	 * a second fault on the exact count case,
+	 * but do we really care? No.
+	 *
+	 * Finally, we could try to align %rdi at the
+	 * top of the unrolling. But unaligned stores
+	 * just aren't that common or expensive.
+	 */
+	_ASM_EXTABLE_UA( 1b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(10b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(11b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(12b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(13b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(14b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(15b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(16b, .Lclear_user_tail)
+	_ASM_EXTABLE_UA(17b, .Lclear_user_tail)
+SYM_FUNC_END(rep_stos_alternative)
+EXPORT_SYMBOL(rep_stos_alternative)
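
Those exception-table entries are what make the returned count exact: a fault on any of the qword stores (labels 1 and 10-17) redirects to .Lclear_user_tail, which re-walks whatever %rcx still says is left one byte at a time until it either finishes or faults again at the true boundary. A rough C model of that recovery path follows, using a hypothetical put_user_byte() helper to stand in for a byte store that may fault (it is not a real kernel function).

/*
 * Rough C model of the fault-recovery path, not kernel code.
 * put_user_byte() is a hypothetical stand-in for a single byte store
 * to user space that can fail; the real code does this with a movb
 * plus an exception-table entry.
 */
static unsigned long tail_after_fault(unsigned char __user *p, unsigned long remaining)
{
	/*
	 * A faulting qword store lands here with 'remaining' still holding
	 * the full not-yet-accounted count, so some already-cleared bytes
	 * may be written again -- the commit explicitly does not care.
	 */
	while (remaining) {
		if (put_user_byte(p, 0))	/* hypothetical helper; nonzero on fault */
			break;			/* remaining == exact uncleared count */
		p++;
		remaining--;
	}
	return remaining;
}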

tools/objtool/check.c

Lines changed: 1 addition & 1 deletion

@@ -1284,7 +1284,7 @@ static const char *uaccess_safe_builtin[] = {
 	"copy_mc_fragile_handle_tail",
 	"copy_mc_enhanced_fast_string",
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
-	"clear_user_original",
+	"rep_stos_alternative",
 	"copy_user_generic_unrolled",
 	"__copy_user_nocache",
 	NULL
