
Commit 577e6a7

x86: inline the 'rep movs' in user copies for the FSRM case
This does the same thing for the user copies as commit 0db7058 ("x86/clear_user: Make it faster") did for clear_user(). In other words, it inlines the "rep movs" case when X86_FEATURE_FSRM is set, avoiding the function call entirely.

In order to do that, it makes the calling convention for the out-of-line case ("copy_user_generic_unrolled") match the 'rep movs' calling convention, although it does also end up clobbering a number of additional registers.

Also, to simplify code sharing in the low-level assembly with the __copy_user_nocache() function (that uses the normal C calling convention), we end up with a kind of mixed return value for the low-level asm code: it will return the result in both %rcx (to work as an alternative for the 'rep movs' case), _and_ in %rax (for the nocache case).

We could avoid this by wrapping __copy_user_nocache() callers in an inline asm, but since the cost is just an extra register copy, it's probably not worth it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 3639a53 commit 577e6a7
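
As background for the calling-convention argument above, here is a minimal user-space sketch (editorial illustration, not part of the commit; the function name is made up) of the 'rep movs' convention the inlined case relies on: destination in %rdi, source in %rsi, count in %rcx, with the number of uncopied bytes left in %rcx when the instruction stops.

#include <stdio.h>
#include <string.h>

/* x86-64 with GCC/Clang inline asm only: copy 'len' bytes with 'rep movsb'
 * and return the residual count, which the instruction leaves in %rcx. */
static unsigned long rep_movsb_copy(void *to, const void *from, unsigned long len)
{
	asm volatile("rep movsb"
		     : "+c" (len), "+D" (to), "+S" (from)
		     : : "memory");
	return len;	/* 0 means the whole buffer was copied */
}

int main(void)
{
	char src[] = "fast short rep movs";
	char dst[sizeof(src)] = { 0 };
	unsigned long left = rep_movsb_copy(dst, src, sizeof(src));

	printf("uncopied=%lu dst=\"%s\"\n", left, dst);
	return memcmp(src, dst, sizeof(src)) != 0;
}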

File tree (3 files changed: +31, -48)

arch/x86/include/asm/uaccess_64.h
arch/x86/lib/copy_user_64.S
tools/objtool/check.c

arch/x86/include/asm/uaccess_64.h

Lines changed: 10 additions & 13 deletions

@@ -18,29 +18,26 @@
 
 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_fast_string(void *to, const void *from, unsigned len);
-__must_check unsigned long
 copy_user_generic_unrolled(void *to, const void *from, unsigned len);
 
 static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned len)
+copy_user_generic(void *to, const void *from, unsigned long len)
 {
-	unsigned ret;
-
 	stac();
 	/*
 	 * If CPU has FSRM feature, use 'rep movs'.
 	 * Otherwise, use copy_user_generic_unrolled.
 	 */
-	alternative_call(copy_user_generic_unrolled,
-			 copy_user_fast_string,
-			 X86_FEATURE_FSRM,
-			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
-				     "=d" (len)),
-			 "1" (to), "2" (from), "3" (len)
-			 : "memory", "rcx", "r8", "r9", "r10", "r11");
+	asm volatile(
+		"1:\n\t"
+		ALTERNATIVE("rep movsb",
+			    "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM))
+		"2:\n"
+		_ASM_EXTABLE_UA(1b, 2b)
+		:"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+		: : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
 	clac();
-	return ret;
+	return len;
 }
 
 static __always_inline __must_check unsigned long
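
The ALTERNATIVE() in the hunk above is resolved once at boot by the kernel's alternatives patching, so CPUs with FSRM execute a bare 'rep movsb' while everything else falls back to the call. Outside the kernel, the closest analogue is a runtime dispatch on the CPUID feature bit; the following standalone sketch is an editorial illustration only (it assumes FSRM is reported in CPUID leaf 7, subleaf 0, EDX bit 4, and uses a plain byte loop as a stand-in for copy_user_generic_unrolled):

#include <cpuid.h>
#include <stdio.h>
#include <string.h>

/* Stand-in for the out-of-line fallback path. */
static void copy_bytes(void *to, const void *from, unsigned long len)
{
	unsigned char *d = to;
	const unsigned char *s = from;

	while (len--)
		*d++ = *s++;
}

static int cpu_has_fsrm(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 0;
	return (edx >> 4) & 1;		/* assumed FSRM bit */
}

static void copy_dispatch(void *to, const void *from, unsigned long len)
{
	if (cpu_has_fsrm())
		asm volatile("rep movsb"
			     : "+c" (len), "+D" (to), "+S" (from)
			     : : "memory");
	else
		copy_bytes(to, from, len);
}

int main(void)
{
	char src[] = "alternative demo";
	char dst[sizeof(src)];

	copy_dispatch(dst, src, sizeof(src));
	printf("fsrm=%d dst=\"%s\"\n", cpu_has_fsrm(), dst);
	return memcmp(src, dst, sizeof(src)) != 0;
}

The kernel avoids the per-call branch entirely: the alternatives machinery rewrites the instruction stream once, which is the point of keeping the two code paths calling-convention compatible.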

arch/x86/lib/copy_user_64.S

Lines changed: 21 additions & 34 deletions

@@ -45,13 +45,29 @@
  * Input:
  * rdi destination
  * rsi source
- * rdx count
+ * rcx count
  *
  * Output:
- * eax uncopied bytes or 0 if successful.
+ * rcx uncopied bytes or 0 if successful.
+ *
+ * NOTE! The calling convention is very intentionally the same as
+ * for 'rep movs', so that we can rewrite the function call with
+ * just a plain 'rep movs' on machines that have FSRM.
+ *
+ * HOWEVER! This function ends up having a lot of the code common
+ * with __copy_user_nocache(), which is a normal C function, and
+ * has a similar calling convention, but gets the 'count' in %rdx,
+ * and returns the result in %rax.
+ *
+ * To share as much code as possible, we end up returning the
+ * result in *both* %rcx/%rax, and we also move the initial count
+ * into %rdx.
+ *
+ * We can clobber rdx/rsi/rdi and r8-r11
  */
 SYM_FUNC_START(copy_user_generic_unrolled)
-	cmpl $8,%edx
+	movl %ecx,%edx
+	cmpl $8,%ecx
 	jb .Lcopy_user_short_string_bytes
 	ALIGN_DESTINATION
 	movl %edx,%ecx
@@ -103,37 +119,6 @@ SYM_FUNC_START(copy_user_generic_unrolled)
 SYM_FUNC_END(copy_user_generic_unrolled)
 EXPORT_SYMBOL(copy_user_generic_unrolled)
 
-/*
- * Some CPUs support FSRM for Fast Short REP MOVS.
- *
- * Only 4GB of copy is supported. This shouldn't be a problem
- * because the kernel normally only writes from/to page sized chunks
- * even if user space passed a longer buffer.
- * And more would be dangerous because both Intel and AMD have
- * errata with rep movsq > 4GB. If someone feels the need to fix
- * this please consider this.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_FUNC_START(copy_user_fast_string)
-	movl %edx,%ecx
-1:	rep movsb
-	xorl %eax,%eax
-	RET
-
-12:	movl %ecx,%eax		/* ecx is zerorest also */
-	RET
-
-	_ASM_EXTABLE_CPY(1b, 12b)
-SYM_FUNC_END(copy_user_fast_string)
-EXPORT_SYMBOL(copy_user_fast_string)
-
 /*
  * Try to copy last bytes and clear the rest if needed.
  * Since protection fault in copy_from/to_user is not a normal situation,
@@ -160,6 +145,7 @@ SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
 
 3:
 	movl %edx,%eax
+	movl %edx,%ecx
 	RET
 
 	_ASM_EXTABLE_CPY(1b, 2b)
@@ -203,6 +189,7 @@ SYM_CODE_START_LOCAL(copy_user_short_string)
 	decl %ecx
 	jnz 21b
 23:	xor %eax,%eax
+	xor %ecx,%ecx
 	RET
 
 40:	leal (%rdx,%rcx,8),%edx

tools/objtool/check.c

Lines changed: 0 additions & 1 deletion

@@ -1286,7 +1286,6 @@ static const char *uaccess_safe_builtin[] = {
 	"ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
 	"clear_user_original",
 	"copy_user_generic_unrolled",
-	"copy_user_fast_string",
 	"__copy_user_nocache",
 	NULL
 };
