Commit 427fda2
x86: improve on the non-rep 'copy_user' function
The old 'copy_user_generic_unrolled' function was oddly implemented for largely historical reasons: it had been largely based on the uncached copy case, which has some other concerns.

For example, the __copy_user_nocache() function uses 'movnti' for the destination stores, and those want the destination to be aligned. In contrast, the regular copy function doesn't really care, and trying to align things only complicates matters.

Also, like the clear_user function, the copy function had some odd handling of the repeat counts, complicating the exception handling for no really good reason. So as with clear_user, just write it to keep all the byte counts in the %rcx register, exactly like the 'rep movs' functionality that this replaces.

Unlike a real 'rep movs', we do allow for this to trash a few temporary registers to not have to unnecessarily save/restore registers on the stack.

And like the clearing case, rename this to what it now clearly is: 'rep_movs_alternative', and make it one coherent function, so that it shows up as such in profiles (instead of the odd split between "copy_user_generic_unrolled" and "copy_user_short_string", the latter of which was not about strings at all, and which was shared with the uncached case).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 8c9b6a8 commit 427fda2
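
From the caller's point of view nothing changes: copy_to_user() and copy_from_user() still return the number of bytes they could not copy, and the FSRM / non-FSRM choice happens underneath. A minimal sketch of that unchanged contract, assuming a hypothetical example_read() helper that is not part of this commit:

    #include <linux/types.h>
    #include <linux/errno.h>
    #include <linux/uaccess.h>

    /* Hypothetical illustration only: the residual-byte return convention of
     * the user-copy helpers is exactly what the new assembly preserves. */
    static long example_read(char __user *buf, const char *kbuf, size_t len)
    {
            /* copy_to_user() returns the number of bytes left uncopied */
            if (copy_to_user(buf, kbuf, len))
                    return -EFAULT;
            return len;
    }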

3 files changed: +138, -161 lines

arch/x86/include/asm/uaccess_64.h

Lines changed: 4 additions & 4 deletions
@@ -18,24 +18,24 @@
 
 /* Handles exceptions in both to and from, but doesn't do access_ok */
 __must_check unsigned long
-copy_user_generic_unrolled(void *to, const void *from, unsigned len);
+rep_movs_alternative(void *to, const void *from, unsigned len);
 
 static __always_inline __must_check unsigned long
 copy_user_generic(void *to, const void *from, unsigned long len)
 {
         stac();
         /*
          * If CPU has FSRM feature, use 'rep movs'.
-         * Otherwise, use copy_user_generic_unrolled.
+         * Otherwise, use rep_movs_alternative.
          */
         asm volatile(
                 "1:\n\t"
                 ALTERNATIVE("rep movsb",
-                            "call copy_user_generic_unrolled", ALT_NOT(X86_FEATURE_FSRM))
+                            "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM))
                 "2:\n"
                 _ASM_EXTABLE_UA(1b, 2b)
                 :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
-                : : "memory", "rax", "rdx", "r8", "r9", "r10", "r11");
+                : : "memory", "rax", "r8", "r9", "r10", "r11");
         clac();
         return len;
 }
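
For readers who don't parse extended-asm constraints at a glance, here is a conceptual C model of the wrapper above. It is an illustration only, assuming the usual kernel headers: the real code uses ALTERNATIVE() runtime patching rather than a branch, the _ASM_EXTABLE_UA() entry that lets a faulting 'rep movsb' resume at the exit label is omitted, and copy_user_generic_model() is an invented name.

    static __always_inline unsigned long
    copy_user_generic_model(void *to, const void *from, unsigned long len)
    {
            stac();                 /* open the user-access window (SMAP) */
            if (cpu_feature_enabled(X86_FEATURE_FSRM)) {
                    /* Fast Short REP MOVSB: 'rep movsb' is fine at any size */
                    asm volatile("rep movsb"
                                 : "+c" (len), "+D" (to), "+S" (from)
                                 : : "memory");
            } else {
                    /* same register contract as 'rep movs': rdi/rsi/rcx in,
                     * uncopied count left in rcx; rax, r8-r11 are scratch */
                    asm volatile("call rep_movs_alternative"
                                 : "+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
                                 : : "memory", "rax", "r8", "r9", "r10", "r11");
            }
            clac();                 /* close the user-access window */
            return len;             /* bytes not copied, 0 on success */
    }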

arch/x86/lib/copy_user_64.S

Lines changed: 133 additions & 156 deletions
@@ -17,30 +17,9 @@
 #include <asm/export.h>
 #include <asm/trapnr.h>
 
-.macro ALIGN_DESTINATION
-        /* check for bad alignment of destination */
-        movl %edi,%ecx
-        andl $7,%ecx
-        jz 102f                         /* already aligned */
-        subl $8,%ecx
-        negl %ecx
-        subl %ecx,%edx
-100:    movb (%rsi),%al
-101:    movb %al,(%rdi)
-        incq %rsi
-        incq %rdi
-        decl %ecx
-        jnz 100b
-102:
-
-        _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
-        _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
-.endm
-
 /*
- * copy_user_generic_unrolled - memory copy with exception handling.
- * This version is for CPUs like P4 that don't have efficient micro
- * code for rep movsq
+ * rep_movs_alternative - memory copy with exception handling.
+ * This version is for CPUs that don't have FSRM (Fast Short Rep Movs)
  *
  * Input:
  * rdi destination
@@ -52,156 +31,119 @@
  *
  * NOTE! The calling convention is very intentionally the same as
  * for 'rep movs', so that we can rewrite the function call with
- * just a plain 'rep movs' on machines that have FSRM.
- *
- * HOWEVER! This function ends up having a lot of the code common
- * with __copy_user_nocache(), which is a normal C function, and
- * has a similar calling convention, but gets the 'count' in %rdx,
- * and returns the result in %rax.
- *
- * To share as much code as possible, we end up returning the
- * result in *both* %rcx/%rax, and we also move the initial count
- * into %rdx.
- *
- * We can clobber rdx/rsi/rdi and r8-r11
+ * just a plain 'rep movs' on machines that have FSRM. But to make
+ * it simpler for us, we can clobber rsi/rdi and rax/r8-r11 freely.
  */
-SYM_FUNC_START(copy_user_generic_unrolled)
-        movl %ecx,%edx
-        cmpl $8,%ecx
-        jb .Lcopy_user_short_string_bytes
-        ALIGN_DESTINATION
-        movl %edx,%ecx
-        andl $63,%edx
-        shrl $6,%ecx
-        jz copy_user_short_string
-1:      movq (%rsi),%r8
-2:      movq 1*8(%rsi),%r9
-3:      movq 2*8(%rsi),%r10
-4:      movq 3*8(%rsi),%r11
-5:      movq %r8,(%rdi)
-6:      movq %r9,1*8(%rdi)
-7:      movq %r10,2*8(%rdi)
-8:      movq %r11,3*8(%rdi)
-9:      movq 4*8(%rsi),%r8
-10:     movq 5*8(%rsi),%r9
-11:     movq 6*8(%rsi),%r10
-12:     movq 7*8(%rsi),%r11
-13:     movq %r8,4*8(%rdi)
-14:     movq %r9,5*8(%rdi)
-15:     movq %r10,6*8(%rdi)
-16:     movq %r11,7*8(%rdi)
-        leaq 64(%rsi),%rsi
-        leaq 64(%rdi),%rdi
-        decl %ecx
-        jnz 1b
-        jmp copy_user_short_string
+SYM_FUNC_START(rep_movs_alternative)
+        cmpq $64,%rcx
+        jae .Lunrolled
 
-30:     shll $6,%ecx
-        addl %ecx,%edx
-        jmp .Lcopy_user_handle_tail
+        cmp $8,%ecx
+        jae .Lword
 
-        _ASM_EXTABLE_CPY(1b, 30b)
-        _ASM_EXTABLE_CPY(2b, 30b)
-        _ASM_EXTABLE_CPY(3b, 30b)
-        _ASM_EXTABLE_CPY(4b, 30b)
-        _ASM_EXTABLE_CPY(5b, 30b)
-        _ASM_EXTABLE_CPY(6b, 30b)
-        _ASM_EXTABLE_CPY(7b, 30b)
-        _ASM_EXTABLE_CPY(8b, 30b)
-        _ASM_EXTABLE_CPY(9b, 30b)
-        _ASM_EXTABLE_CPY(10b, 30b)
-        _ASM_EXTABLE_CPY(11b, 30b)
-        _ASM_EXTABLE_CPY(12b, 30b)
-        _ASM_EXTABLE_CPY(13b, 30b)
-        _ASM_EXTABLE_CPY(14b, 30b)
-        _ASM_EXTABLE_CPY(15b, 30b)
-        _ASM_EXTABLE_CPY(16b, 30b)
-SYM_FUNC_END(copy_user_generic_unrolled)
-EXPORT_SYMBOL(copy_user_generic_unrolled)
+        testl %ecx,%ecx
+        je .Lexit
 
-/*
- * Try to copy last bytes and clear the rest if needed.
- * Since protection fault in copy_from/to_user is not a normal situation,
- * it is not necessary to optimize tail handling.
- * Don't try to copy the tail if machine check happened
- *
- * Input:
- * eax trap number written by ex_handler_copy()
- * rdi destination
- * rsi source
- * rdx count
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
- */
-SYM_CODE_START_LOCAL(.Lcopy_user_handle_tail)
-        cmp $X86_TRAP_MC,%eax
-        je 3f
-
-        movl %edx,%ecx
-1:      rep movsb
-2:      mov %ecx,%eax
+.Lcopy_user_tail:
+0:      movb (%rsi),%al
+1:      movb %al,(%rdi)
+        inc %rdi
+        inc %rsi
+        dec %rcx
+        jne .Lcopy_user_tail
+.Lexit:
         RET
 
-3:
-        movl %edx,%eax
-        movl %edx,%ecx
-        RET
+        _ASM_EXTABLE_UA( 0b, .Lexit)
+        _ASM_EXTABLE_UA( 1b, .Lexit)
 
-        _ASM_EXTABLE_CPY(1b, 2b)
+        .p2align 4
+.Lword:
+2:      movq (%rsi),%rax
+3:      movq %rax,(%rdi)
+        addq $8,%rsi
+        addq $8,%rdi
+        sub $8,%ecx
+        je .Lexit
+        cmp $8,%ecx
+        jae .Lword
+        jmp .Lcopy_user_tail
 
-.Lcopy_user_handle_align:
-        addl %ecx,%edx                  /* ecx is zerorest also */
-        jmp .Lcopy_user_handle_tail
+        _ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
 
-SYM_CODE_END(.Lcopy_user_handle_tail)
+        .p2align 4
+.Lunrolled:
+10:     movq (%rsi),%r8
+11:     movq 8(%rsi),%r9
+12:     movq 16(%rsi),%r10
+13:     movq 24(%rsi),%r11
+14:     movq %r8,(%rdi)
+15:     movq %r9,8(%rdi)
+16:     movq %r10,16(%rdi)
+17:     movq %r11,24(%rdi)
+20:     movq 32(%rsi),%r8
+21:     movq 40(%rsi),%r9
+22:     movq 48(%rsi),%r10
+23:     movq 56(%rsi),%r11
+24:     movq %r8,32(%rdi)
+25:     movq %r9,40(%rdi)
+26:     movq %r10,48(%rdi)
+27:     movq %r11,56(%rdi)
+        addq $64,%rsi
+        addq $64,%rdi
+        subq $64,%rcx
+        cmpq $64,%rcx
+        jae .Lunrolled
+        cmpl $8,%ecx
+        jae .Lword
+        testl %ecx,%ecx
+        jne .Lcopy_user_tail
+        RET
+
+        _ASM_EXTABLE_UA(10b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(11b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(12b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(13b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(14b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(15b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(16b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(17b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(20b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(21b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(22b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(23b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(24b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(25b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(26b, .Lcopy_user_tail)
+        _ASM_EXTABLE_UA(27b, .Lcopy_user_tail)
+SYM_FUNC_END(rep_movs_alternative)
+EXPORT_SYMBOL(rep_movs_alternative)
 
 /*
- * Finish memcpy of less than 64 bytes. #AC should already be set.
- *
- * Input:
- * rdi destination
- * rsi source
- * rdx count (< 64)
- *
- * Output:
- * eax uncopied bytes or 0 if successful.
+ * The uncached copy needs to align the destination for
+ * movnti and friends.
  */
-SYM_CODE_START_LOCAL(copy_user_short_string)
-        movl %edx,%ecx
-        andl $7,%edx
-        shrl $3,%ecx
-        jz .Lcopy_user_short_string_bytes
-18:     movq (%rsi),%r8
-19:     movq %r8,(%rdi)
-        leaq 8(%rsi),%rsi
-        leaq 8(%rdi),%rdi
-        decl %ecx
-        jnz 18b
-.Lcopy_user_short_string_bytes:
-        andl %edx,%edx
-        jz 23f
-        movl %edx,%ecx
-21:     movb (%rsi),%al
-22:     movb %al,(%rdi)
+.macro ALIGN_DESTINATION
+        /* check for bad alignment of destination */
+        movl %edi,%ecx
+        andl $7,%ecx
+        jz 102f                         /* already aligned */
+        subl $8,%ecx
+        negl %ecx
+        subl %ecx,%edx
+100:    movb (%rsi),%al
+101:    movb %al,(%rdi)
         incq %rsi
         incq %rdi
         decl %ecx
-        jnz 21b
-23:     xor %eax,%eax
-        xor %ecx,%ecx
-        RET
+        jnz 100b
+102:
 
-40:     leal (%rdx,%rcx,8),%edx
-        jmp 60f
-50:     movl %ecx,%edx                  /* ecx is zerorest also */
-60:     jmp .Lcopy_user_handle_tail
+        _ASM_EXTABLE_CPY(100b, .Lcopy_user_handle_align)
+        _ASM_EXTABLE_CPY(101b, .Lcopy_user_handle_align)
+.endm
 
-        _ASM_EXTABLE_CPY(18b, 40b)
-        _ASM_EXTABLE_CPY(19b, 40b)
-        _ASM_EXTABLE_CPY(21b, 50b)
-        _ASM_EXTABLE_CPY(22b, 50b)
-SYM_CODE_END(copy_user_short_string)
 
 /*
  * copy_user_nocache - Uncached memory copy with exception handling
@@ -346,5 +288,40 @@ SYM_FUNC_START(__copy_user_nocache)
         _ASM_EXTABLE_CPY(31b, .L_fixup_4b_copy)
         _ASM_EXTABLE_CPY(40b, .L_fixup_1b_copy)
         _ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
+
+/*
+ * Try to copy last bytes and clear the rest if needed.
+ * Since protection fault in copy_from/to_user is not a normal situation,
+ * it is not necessary to optimize tail handling.
+ * Don't try to copy the tail if machine check happened
+ *
+ * Input:
+ * eax trap number written by ex_handler_copy()
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+.Lcopy_user_handle_tail:
+        cmp $X86_TRAP_MC,%eax
+        je 3f
+
+        movl %edx,%ecx
+1:      rep movsb
+2:      mov %ecx,%eax
+        RET
+
+3:
+        movl %edx,%eax
+        RET
+
+        _ASM_EXTABLE_CPY(1b, 2b)
+
+.Lcopy_user_handle_align:
+        addl %ecx,%edx                  /* ecx is zerorest also */
+        jmp .Lcopy_user_handle_tail
+
 SYM_FUNC_END(__copy_user_nocache)
 EXPORT_SYMBOL(__copy_user_nocache)
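
The control flow of the new rep_movs_alternative is easier to see in C. A pure-C sketch of its size buckets follows (rep_movs_model() is an invented name, and fault handling is left out; in the real function a fault on any load or store jumps into the byte tail so the returned count reflects what was actually copied):

    static unsigned long rep_movs_model(unsigned char *dst,
                                        const unsigned char *src,
                                        unsigned long count)
    {
            /* .Lunrolled: 64 bytes per iteration, as eight 8-byte moves */
            while (count >= 64) {
                    __builtin_memcpy(dst, src, 64);
                    dst += 64; src += 64; count -= 64;
            }
            /* .Lword: one 8-byte move per iteration */
            while (count >= 8) {
                    __builtin_memcpy(dst, src, 8);
                    dst += 8; src += 8; count -= 8;
            }
            /* .Lcopy_user_tail: copy the remainder a byte at a time */
            while (count) {
                    *dst++ = *src++;
                    count--;
            }
            return count;   /* 0 here; nonzero only after a fault in the real asm */
    }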

tools/objtool/check.c

Lines changed: 1 addition & 1 deletion
@@ -1285,7 +1285,7 @@ static const char *uaccess_safe_builtin[] = {
         "copy_mc_enhanced_fast_string",
         "ftrace_likely_update", /* CONFIG_TRACE_BRANCH_PROFILING */
         "rep_stos_alternative",
-        "copy_user_generic_unrolled",
+        "rep_movs_alternative",
         "__copy_user_nocache",
         NULL
 };
