Skip to content

Commit fa69714

Browse files
Dominik Brodowski authored and Ingo Molnar committed
syscalls/x86: Use 'struct pt_regs' based syscall calling convention for 64-bit syscalls
Let's make use of ARCH_HAS_SYSCALL_WRAPPER=y on pure 64-bit x86-64 systems:

Each syscall defines a stub which takes struct pt_regs as its only
argument. It decodes just those parameters it needs, e.g:

	asmlinkage long sys_xyzzy(const struct pt_regs *regs)
	{
		return SyS_xyzzy(regs->di, regs->si, regs->dx);
	}

This approach avoids leaking random user-provided register content down
the call chain.

For example, for sys_recv() which is a 4-parameter syscall, the assembly
now is (in slightly reordered fashion):

	<sys_recv>:
		callq	<__fentry__>

		/* decode regs->di, ->si, ->dx and ->r10 */
		mov	0x70(%rdi),%rdi
		mov	0x68(%rdi),%rsi
		mov	0x60(%rdi),%rdx
		mov	0x38(%rdi),%rcx

		[ SyS_recv() is automatically inlined by the compiler,
		  as it is not [yet] used anywhere else ]

		/* clear %r9 and %r8, the 5th and 6th args */
		xor	%r9d,%r9d
		xor	%r8d,%r8d

		/* do the actual work */
		callq	__sys_recvfrom

		/* cleanup and return */
		cltq
		retq

The only valid place in an x86-64 kernel which rightfully calls
a syscall function on its own -- vsyscall -- needs to be modified
to pass struct pt_regs onwards as well.

To keep the syscall table generation working independent of
SYSCALL_PTREGS being enabled, the stubs are named the same as the
"original" syscall stubs, i.e. sys_*().

This patch is based on an original proof-of-concept

 | From: Linus Torvalds <torvalds@linux-foundation.org>
 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

and was split up and heavily modified by me, in particular to base it on
ARCH_HAS_SYSCALL_WRAPPER, to limit it to 64-bit-only for the time being,
and to update the vsyscall to the new calling convention.

Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20180405095307.3730-4-linux@dominikbrodowski.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
1 parent 1bd21c6 commit fa69714

File tree

8 files changed

+120
-3
lines changed

8 files changed

+120
-3
lines changed

arch/x86/Kconfig

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2954,3 +2954,8 @@ source "crypto/Kconfig"
29542954
source "arch/x86/kvm/Kconfig"
29552955

29562956
source "lib/Kconfig"
2957+
2958+
config SYSCALL_PTREGS
2959+
def_bool y
2960+
depends on X86_64 && !COMPAT
2961+
select ARCH_HAS_SYSCALL_WRAPPER

arch/x86/entry/common.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -284,9 +284,13 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
284284
nr &= __SYSCALL_MASK;
285285
if (likely(nr < NR_syscalls)) {
286286
nr = array_index_nospec(nr, NR_syscalls);
287+
#ifdef CONFIG_SYSCALL_PTREGS
288+
regs->ax = sys_call_table[nr](regs);
289+
#else
287290
regs->ax = sys_call_table[nr](
288291
regs->di, regs->si, regs->dx,
289292
regs->r10, regs->r8, regs->r9);
293+
#endif
290294
}
291295

292296
syscall_return_slowpath(regs);

arch/x86/entry/syscall_64.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,19 @@
77
#include <asm/asm-offsets.h>
88
#include <asm/syscall.h>
99

10+
#ifdef CONFIG_SYSCALL_PTREGS
11+
/* this is a lie, but it does not hurt as sys_ni_syscall ignores its arguments and just returns -ENOSYS */
12+
extern asmlinkage long sys_ni_syscall(const struct pt_regs *);
13+
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(const struct pt_regs *);
14+
#else /* CONFIG_SYSCALL_PTREGS */
15+
extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
1016
#define __SYSCALL_64(nr, sym, qual) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
17+
#endif /* CONFIG_SYSCALL_PTREGS */
1118
#include <asm/syscalls_64.h>
1219
#undef __SYSCALL_64
1320

1421
#define __SYSCALL_64(nr, sym, qual) [nr] = sym,
1522

16-
extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
17-
1823
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
1924
/*
2025
* Smells like a compiler bug -- it doesn't work

arch/x86/entry/vsyscall/vsyscall_64.c

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,9 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
127127
int vsyscall_nr, syscall_nr, tmp;
128128
int prev_sig_on_uaccess_err;
129129
long ret;
130+
#ifdef CONFIG_SYSCALL_PTREGS
131+
unsigned long orig_dx;
132+
#endif
130133

131134
/*
132135
* No point in checking CS -- the only way to get here is a user mode
@@ -227,19 +230,38 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
227230
ret = -EFAULT;
228231
switch (vsyscall_nr) {
229232
case 0:
233+
#ifdef CONFIG_SYSCALL_PTREGS
234+
/* this decodes regs->di and regs->si on its own */
235+
ret = sys_gettimeofday(regs);
236+
#else
230237
ret = sys_gettimeofday(
231238
(struct timeval __user *)regs->di,
232239
(struct timezone __user *)regs->si);
240+
#endif /* CONFIG_SYSCALL_PTREGS */
233241
break;
234242

235243
case 1:
244+
#ifdef CONFIG_SYSCALL_PTREGS
245+
/* this decodes regs->di on its own */
246+
ret = sys_time(regs);
247+
#else
236248
ret = sys_time((time_t __user *)regs->di);
249+
#endif /* CONFIG_SYSCALL_PTREGS */
237250
break;
238251

239252
case 2:
253+
#ifdef CONFIG_SYSCALL_PTREGS
254+
/* while we could clobber regs->dx, we didn't in the past... */
255+
orig_dx = regs->dx;
256+
regs->dx = 0;
257+
/* this decodes regs->di, regs->si and regs->dx on its own */
258+
ret = sys_getcpu(regs);
259+
regs->dx = orig_dx;
260+
#else
240261
ret = sys_getcpu((unsigned __user *)regs->di,
241262
(unsigned __user *)regs->si,
242263
NULL);
264+
#endif /* CONFIG_SYSCALL_PTREGS */
243265
break;
244266
}
245267

arch/x86/include/asm/syscall.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,13 @@
2020
#include <asm/thread_info.h> /* for TS_COMPAT */
2121
#include <asm/unistd.h>
2222

23+
#ifdef CONFIG_SYSCALL_PTREGS
24+
typedef asmlinkage long (*sys_call_ptr_t)(const struct pt_regs *);
25+
#else
2326
typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long,
2427
unsigned long, unsigned long,
2528
unsigned long, unsigned long);
29+
#endif /* CONFIG_SYSCALL_PTREGS */
2630
extern const sys_call_ptr_t sys_call_table[];
2731

2832
#if defined(CONFIG_X86_32)
arch/x86/include/asm/syscall_wrapper.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/* SPDX-License-Identifier: GPL-2.0 */
2+
/*
3+
* syscall_wrapper.h - x86 specific wrappers to syscall definitions
4+
*/
5+
6+
#ifndef _ASM_X86_SYSCALL_WRAPPER_H
7+
#define _ASM_X86_SYSCALL_WRAPPER_H
8+
9+
/*
10+
* Instead of the generic __SYSCALL_DEFINEx() definition, this macro takes
11+
* struct pt_regs *regs as the only argument of the syscall stub named
12+
* sys_*(). It decodes just the registers it needs and passes them on to
13+
* the SyS_*() wrapper and then to the SYSC_*() function doing the actual job.
14+
* These wrappers and functions are inlined, meaning that the assembly looks
15+
* as follows (slightly re-ordered):
16+
*
17+
* <sys_recv>: <-- syscall with 4 parameters
18+
* callq <__fentry__>
19+
*
20+
* mov 0x70(%rdi),%rdi <-- decode regs->di
21+
* mov 0x68(%rdi),%rsi <-- decode regs->si
22+
* mov 0x60(%rdi),%rdx <-- decode regs->dx
23+
* mov 0x38(%rdi),%rcx <-- decode regs->r10
24+
*
25+
* xor %r9d,%r9d <-- clear %r9
26+
* xor %r8d,%r8d <-- clear %r8
27+
*
28+
* callq __sys_recvfrom <-- do the actual work in __sys_recvfrom()
29+
* which takes 6 arguments
30+
*
31+
* cltq <-- extend return value to 64-bit
32+
* retq <-- return
33+
*
34+
* This approach avoids leaking random user-provided register content down
35+
* the call chain.
36+
*
37+
* As the generic SYSCALL_DEFINE0() macro does not decode any parameters for
38+
* obvious reasons, and passing struct pt_regs *regs to it in %rdi does not
39+
* hurt, there is no need to override it.
40+
*/
41+
#define __SYSCALL_DEFINEx(x, name, ...) \
42+
asmlinkage long sys##name(const struct pt_regs *regs); \
43+
ALLOW_ERROR_INJECTION(sys##name, ERRNO); \
44+
static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
45+
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
46+
asmlinkage long sys##name(const struct pt_regs *regs) \
47+
{ \
48+
return SyS##name(__MAP(x,__SC_ARGS \
49+
,,regs->di,,regs->si,,regs->dx \
50+
,,regs->r10,,regs->r8,,regs->r9)); \
51+
} \
52+
static long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
53+
{ \
54+
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
55+
__MAP(x,__SC_TEST,__VA_ARGS__); \
56+
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
57+
return ret; \
58+
} \
59+
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
60+
61+
/*
62+
* For VSYSCALLS, we need to declare these three syscalls with the new
63+
* pt_regs-based calling convention for in-kernel use.
64+
*/
65+
struct pt_regs;
66+
asmlinkage long sys_getcpu(const struct pt_regs *regs); /* di,si,dx */
67+
asmlinkage long sys_gettimeofday(const struct pt_regs *regs); /* di,si */
68+
asmlinkage long sys_time(const struct pt_regs *regs); /* di */
69+
70+
#endif /* _ASM_X86_SYSCALL_WRAPPER_H */

arch/x86/include/asm/syscalls.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,12 @@
1818
/* Common in X86_32 and X86_64 */
1919
/* kernel/ioport.c */
2020
long ksys_ioperm(unsigned long from, unsigned long num, int turn_on);
21+
22+
#ifndef CONFIG_SYSCALL_PTREGS
23+
/*
24+
* If CONFIG_SYSCALL_PTREGS is enabled, a different syscall calling convention
25+
* is used. Do not include these -- invalid -- prototypes then
26+
*/
2127
asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
2228
asmlinkage long sys_iopl(unsigned int);
2329

@@ -53,4 +59,5 @@ asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
5359
unsigned long, unsigned long, unsigned long);
5460

5561
#endif /* CONFIG_X86_32 */
62+
#endif /* CONFIG_SYSCALL_PTREGS */
5663
#endif /* _ASM_X86_SYSCALLS_H */

include/linux/syscalls.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ union bpf_attr;
102102
* for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
103103
*/
104104
#define __MAP0(m,...)
105-
#define __MAP1(m,t,a) m(t,a)
105+
#define __MAP1(m,t,a,...) m(t,a)
106106
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
107107
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
108108
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)

0 commit comments

Comments
 (0)