From 4a5ce3e99388cd7306ba5be0f61992bf8fbfba36 Mon Sep 17 00:00:00 2001 From: Dmitrii Kuvaiskii Date: Wed, 24 Apr 2024 06:00:58 -0700 Subject: [PATCH] [LibOS] Add `sys.mock_syscalls = [ ... ]` manifest option This commit adds the manifest syntax `sys.mock_syscalls = [ ... ]` to specify system calls that will be mocked when executed in Gramine (i.e. return a specified value without any other side effects). This may be particularly important for cases where the overhead of invoking a system call on the host (e.g. exiting the SGX enclave) becomes a performance bottleneck, and it is more beneficial to disable or no-op the syscall in the first place; `sched_yield()` is an example. Another example may be disabling certain functionalities for security reasons. For example, one may want to disable `eventfd()` and `eventfd2()` to forbid creation of eventfd objects. Yet another example may be mocking syscalls currently not implemented in Gramine. E.g. it may be enough to mock `vhangup()` to always return 0, so that the workload proceeds further. 
Signed-off-by: Dmitrii Kuvaiskii --- Documentation/manifest-syntax.rst | 42 ++++++++++ libos/include/libos_internal.h | 3 + libos/include/libos_table.h | 7 +- libos/src/arch/x86_64/libos_table.c | 3 + libos/src/libos_init.c | 1 + libos/src/libos_parser.c | 20 +++++ libos/src/libos_syscalls.c | 80 ++++++++++++++++++- libos/test/regression/meson.build | 1 + libos/test/regression/mock_syscalls.c | 60 ++++++++++++++ .../mock_syscalls.manifest.template | 41 ++++++++++ libos/test/regression/test_libos.py | 11 +++ libos/test/regression/tests.toml | 5 +- libos/test/regression/tests_musl.toml | 3 +- python/graminelibos/manifest_check.py | 2 + 14 files changed, 274 insertions(+), 5 deletions(-) create mode 100644 libos/test/regression/mock_syscalls.c create mode 100644 libos/test/regression/mock_syscalls.manifest.template diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst index b96fa1892f..cab4b8949e 100644 --- a/Documentation/manifest-syntax.rst +++ b/Documentation/manifest-syntax.rst @@ -391,6 +391,48 @@ Python). Could be useful in SGX environments: child processes consume to achieve this, you need to run the whole Gramine inside a proper security sandbox. +Mocking syscalls +^^^^^^^^^^^^^^^^ + +:: + + sys.mock_syscalls = [ + { name = "syscall_name1", return = 0 }, # no-op syscall + { name = "syscall_name2", return = -38 }, # denied syscall (ENOSYS) + ] + +This syntax specifies the system calls that are mocked when executed in +Gramine (i.e. they return a specified value without any other side effects). +If the ``return`` field is omitted, the default value is ``0`` (no-op). + +For example, to skip the ``sched_yield`` syscall, specify:: + + sys.mock_syscalls = [ + { name = "sched_yield", return = 0 }, + ] + +As another example, to disallow eventfd completely, specify:: + + sys.mock_syscalls = [ + { name = "eventfd", return = -38 }, + { name = "eventfd2", return = -38 }, + ] + + +.. 
note :: + This option is *not* a replacement for ``sys.disallow_subprocesses`` (see + above). This is because the ``clone()`` syscall has two usages: (1) it is + used to spawn subprocesses by Glibc and many other libraries and runtimes and + (2) it is also used to create threads in the same process. The + ``sys.disallow_subprocesses`` manifest option disables only the first usage, + whereas ``sys.mock_syscalls = [ { name = "clone", ... } ]`` disables both usages. + +.. note :: + This option is *not* a security feature. Its rationale is improving + performance (the example of ``sched_yield``), mocking syscalls currently not + implemented in Gramine, and limiting syscalls exposed to the app. + + Root FS mount point ^^^^^^^^^^^^^^^^^^^ diff --git a/libos/include/libos_internal.h b/libos/include/libos_internal.h index cb108441b2..6a26234523 100644 --- a/libos/include/libos_internal.h +++ b/libos/include/libos_internal.h @@ -155,8 +155,11 @@ extern bool g_eventfd_passthrough_mode; int init_eventfd_mode(void); void warn_unsupported_syscall(unsigned long sysno); +void trace_mock_syscall(unsigned long sysno); void debug_print_syscall_before(unsigned long sysno, ...); void debug_print_syscall_after(unsigned long sysno, ...); +int get_syscall_number(const char* name, unsigned long* out_sysno); +int init_syscalls(void); #ifndef __alloca #define __alloca __builtin_alloca diff --git a/libos/include/libos_table.h b/libos/include/libos_table.h index aa6aaf25c7..b2eb646526 100644 --- a/libos/include/libos_table.h +++ b/libos/include/libos_table.h @@ -12,9 +12,14 @@ #include "linux_abi/sysinfo.h" typedef void (*libos_syscall_t)(void); - extern libos_syscall_t libos_syscall_table[]; +struct libos_mock_syscall { + bool is_mocked; + long return_value; +}; +extern struct libos_mock_syscall libos_mock_syscall_table[]; + /* syscall implementation */ long libos_syscall_read(int fd, void* buf, size_t count); long libos_syscall_write(int fd, const void* buf, size_t count); diff --git 
a/libos/src/arch/x86_64/libos_table.c b/libos/src/arch/x86_64/libos_table.c index 86147ec29e..615da2429e 100644 --- a/libos/src/arch/x86_64/libos_table.c +++ b/libos/src/arch/x86_64/libos_table.c @@ -377,3 +377,6 @@ libos_syscall_t libos_syscall_table[LIBOS_SYSCALL_BOUND] = { [__NR_futex_waitv] = (libos_syscall_t)0, // libos_syscall_futex_waitv [__NR_set_mempolicy_home_node] = (libos_syscall_t)0, // libos_syscall_set_mempolicy_home_node }; + +/* by default, all syscalls have `is_mocked = false` and `return_value = 0` */ +struct libos_mock_syscall libos_mock_syscall_table[LIBOS_SYSCALL_BOUND] = { 0 }; diff --git a/libos/src/libos_init.c b/libos/src/libos_init.c index 8d26d4b5ed..faa07e4cbe 100644 --- a/libos/src/libos_init.c +++ b/libos/src/libos_init.c @@ -503,6 +503,7 @@ noreturn void libos_init(const char* const* argv, const char* const* envp) { strlen(g_pal_public_state->dns_host.hostname)); RUN_INIT(init_eventfd_mode); + RUN_INIT(init_syscalls); log_debug("LibOS initialized"); diff --git a/libos/src/libos_parser.c b/libos/src/libos_parser.c index e9de40c9cd..d1546e3c95 100644 --- a/libos/src/libos_parser.c +++ b/libos/src/libos_parser.c @@ -1649,6 +1649,26 @@ void warn_unsupported_syscall(unsigned long sysno) { log_warning("Unsupported system call %lu", sysno); } +void trace_mock_syscall(unsigned long sysno) { + log_trace("%s(...) 
= %ld (mock)", syscall_parser_table[sysno].name, + libos_mock_syscall_table[sysno].return_value); +} + +int get_syscall_number(const char* name, unsigned long* out_sysno) { + static_assert(LIBOS_SYSCALL_BOUND == ARRAY_SIZE(syscall_parser_table), "oops"); + assert(out_sysno); + + for (size_t i = 0; i < LIBOS_SYSCALL_BOUND; i++) { + if (!syscall_parser_table[i].name) + continue; + if (strcmp(name, syscall_parser_table[i].name) == 0) { + *out_sysno = i; + return 0; + } + } + return -ENOSYS; +} + static int buf_write_all(const char* str, size_t size, void* arg) { __UNUSED(arg); diff --git a/libos/src/libos_syscalls.c b/libos/src/libos_syscalls.c index 83be137621..3093cf541c 100644 --- a/libos/src/libos_syscalls.c +++ b/libos/src/libos_syscalls.c @@ -11,7 +11,9 @@ #include "libos_table.h" #include "libos_tcb.h" #include "libos_thread.h" +#include "libos_utils.h" #include "linux_abi/errors.h" +#include "toml_utils.h" typedef arch_syscall_arg_t (*six_args_syscall_t)(arch_syscall_arg_t, arch_syscall_arg_t, arch_syscall_arg_t, arch_syscall_arg_t, @@ -31,7 +33,19 @@ noreturn void libos_emulate_syscall(PAL_CONTEXT* context) { unsigned long args[] = { ALL_SYSCALL_ARGS(context) }; ret = handle_libos_call(args[0], args[1], args[2]); } else { - if (sysnr >= LIBOS_SYSCALL_BOUND || !libos_syscall_table[sysnr]) { + if (sysnr >= LIBOS_SYSCALL_BOUND) { + warn_unsupported_syscall(sysnr); + ret = -ENOSYS; + goto out; + } + + if (libos_mock_syscall_table[sysnr].is_mocked) { + trace_mock_syscall(sysnr); + ret = libos_mock_syscall_table[sysnr].return_value; + goto out; + } + + if (!libos_syscall_table[sysnr]) { warn_unsupported_syscall(sysnr); ret = -ENOSYS; goto out; @@ -84,3 +98,67 @@ noreturn void return_from_syscall(PAL_CONTEXT* context) { #endif _return_from_syscall(context); } + +int init_syscalls(void) { + assert(g_manifest_root); + int ret; + + toml_table_t* manifest_sys = toml_table_in(g_manifest_root, "sys"); + if (!manifest_sys) + return 0; + + toml_array_t* toml_mock_syscalls = 
toml_array_in(manifest_sys, "mock_syscalls"); + if (!toml_mock_syscalls) + return 0; + + ssize_t toml_mock_syscalls_cnt = toml_array_nelem(toml_mock_syscalls); + if (toml_mock_syscalls_cnt < 0) + return -EPERM; + if (toml_mock_syscalls_cnt == 0) + return 0; + + char* syscall_name = NULL; + + for (ssize_t i = 0; i < toml_mock_syscalls_cnt; i++) { + toml_table_t* toml_mock_syscall = toml_table_at(toml_mock_syscalls, i); + if (!toml_mock_syscall) { + log_error("Invalid mock syscall in manifest at index %ld (not a TOML table)", i); + ret = -EINVAL; + goto out; + } + + ret = toml_string_in(toml_mock_syscall, "name", &syscall_name); + if (ret < 0 || !syscall_name) { + log_error("Invalid mock syscall in manifest at index %ld (can't parse `name`)", i); + ret = -EINVAL; + goto out; + } + + int64_t syscall_return; + ret = toml_int_in(toml_mock_syscall, "return", /*defaultval=*/0, &syscall_return); + if (ret < 0) { + log_error("Invalid mock syscall in manifest at index %ld (can't parse `return`)", i); + ret = -EINVAL; + goto out; + } + + unsigned long sysno; + ret = get_syscall_number(syscall_name, &sysno); + if (ret < 0) { + log_error("Unrecognized mock syscall `%s` in manifest at index %ld", syscall_name, i); + goto out; + } + + /* add syscall to the table of mocked syscalls */ + libos_mock_syscall_table[sysno].is_mocked = true; + libos_mock_syscall_table[sysno].return_value = syscall_return; + + free(syscall_name); + syscall_name = NULL; + } + + ret = 0; +out: + free(syscall_name); + return ret; +} diff --git a/libos/test/regression/meson.build b/libos/test/regression/meson.build index 48b9f54a05..d62e28bed2 100644 --- a/libos/test/regression/meson.build +++ b/libos/test/regression/meson.build @@ -72,6 +72,7 @@ tests = { 'mmap_file': {}, 'mmap_file_backed': {}, 'mmap_file_emulated': {}, + 'mock_syscalls': {}, 'mprotect_file_fork': {}, 'mprotect_prot_growsdown': {}, 'multi_pthread': {}, diff --git a/libos/test/regression/mock_syscalls.c b/libos/test/regression/mock_syscalls.c new file mode 
100644 index 0000000000..52d0b7f256 --- /dev/null +++ b/libos/test/regression/mock_syscalls.c @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: LGPL-3.0-or-later */ +/* Copyright (C) 2024 Intel Corporation + * Dmitrii Kuvaiskii + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int main(void) { + int ret; + + errno = 0; + ret = eventfd(0, 0); + if (ret != -1 || errno != ENOSYS) + errx(1, "expected eventfd to fail with -ENOSYS but it returned ret=%d errno=%d", ret, + errno); + + errno = 0; + ret = fork(); + if (ret != -1 || errno != ENOSYS) + errx(1, "expected fork to fail with -ENOSYS but it returned ret=%d errno=%d", ret, errno); + + errno = 0; + ret = getpid(); + if (ret < 0) + errx(1, "expected getpid to succeed but it returned ret=%d errno=%d", ret, errno); + + errno = 0; + ret = getppid(); + if (ret < 0) + errx(1, "expected getppid to succeed but it returned ret=%d errno=%d", ret, errno); + + /* sched_yield must *not* appear in strace on the host; this case is added for manual testing */ + for (int i = 0; i < 100; i++) { + errno = 0; + ret = sched_yield(); + if (ret < 0) { + errx(1, "expected sched_yield to succeed (no-op) but it returned ret=%d errno=%d", + ret, errno); + } + } + + /* vhangup was chosen as a syscall that will most certainly not be implemented in Gramine */ + errno = 0; + ret = vhangup(); + if (ret != 123) + errx(1, "expected vhangup to succeed (as a no-op, with dummy return value 123) but it " + "returned ret=%d errno=%d", ret, errno); + + puts("TEST OK"); + return 0; +} diff --git a/libos/test/regression/mock_syscalls.manifest.template b/libos/test/regression/mock_syscalls.manifest.template new file mode 100644 index 0000000000..34fa89fd71 --- /dev/null +++ b/libos/test/regression/mock_syscalls.manifest.template @@ -0,0 +1,41 @@ +loader.entrypoint = "file:{{ gramine.libos }}" +libos.entrypoint = "{{ entrypoint }}" + +loader.log_level = "trace" + +loader.env.LD_LIBRARY_PATH = 
"/lib" + +fs.mounts = [ + { path = "/lib", uri = "file:{{ gramine.runtimedir(libc) }}" }, + { path = "/{{ entrypoint }}", uri = "file:{{ binary_dir }}/{{ entrypoint }}" }, +] + +sys.mock_syscalls = [ + # sched_yield is mocked as no-op (`return = 0` by default), sometimes useful for performance; + # this no-op behavior should be evident from strace on the host + { name = "sched_yield" }, + + # vhangup is not implemented in Gramine but here mocked as no-op with a dummy return value + { name = "vhangup", return = 123 }, + + # even though glibc wrapper is called eventfd, glibc translates it into eventfd2; + # we specify both syscall variants to be on the safe side + { name = "eventfd", return = -38 }, + { name = "eventfd2", return = -38 }, + + # even though glibc wrapper is called fork, glibc translates it into clone; at the same time, musl + # uses fork syscall; we specify all syscall variants to be on the safe side + { name = "fork", return = -38 }, + { name = "vfork", return = -38 }, + { name = "clone", return = -38 }, + { name = "clone3", return = -38 }, +] + +sgx.debug = true +sgx.edmm_enable = {{ 'true' if env.get('EDMM', '0') == '1' else 'false' }} + +sgx.trusted_files = [ + "file:{{ gramine.libos }}", + "file:{{ gramine.runtimedir(libc) }}/", + "file:{{ binary_dir }}/{{ entrypoint }}", +] diff --git a/libos/test/regression/test_libos.py b/libos/test/regression/test_libos.py index b6d771c044..3af32141df 100644 --- a/libos/test/regression/test_libos.py +++ b/libos/test/regression/test_libos.py @@ -1030,6 +1030,17 @@ def test_010_syscall_restart(self): self.assertIn('Got: P', stdout) self.assertIn('TEST 2 OK', stdout) + def test_020_mock_syscalls(self): + stdout, stderr = self.run_binary(['mock_syscalls']) + self.assertIn('eventfd2(...) = -38 (mock)', stderr) + if USES_MUSL: + self.assertIn('fork(...) = -38 (mock)', stderr) + else: + self.assertIn('clone(...) = -38 (mock)', stderr) + self.assertIn('sched_yield(...) 
= 0 (mock)', stderr) + self.assertIn('vhangup(...) = 123 (mock)', stderr) + self.assertIn('TEST OK', stdout) + class TC_40_FileSystem(RegressionTestCase): def test_000_proc(self): stdout, _ = self.run_binary(['proc_common']) diff --git a/libos/test/regression/tests.toml b/libos/test/regression/tests.toml index 9b68f74ee0..c6c87165a6 100644 --- a/libos/test/regression/tests.toml +++ b/libos/test/regression/tests.toml @@ -60,9 +60,9 @@ manifests = [ "gettimeofday", "groups", "helloworld", + "host_root_fs", "hostname", "hostname_extra_runtime_conf", - "host_root_fs", "init_fail", "itimer", "keys", @@ -75,6 +75,7 @@ manifests = [ "mmap_file", "mmap_file_backed", "mmap_file_emulated", + "mock_syscalls", "mprotect_file_fork", "mprotect_prot_growsdown", "multi_pthread", @@ -105,8 +106,8 @@ manifests = [ "sealed_file_mod", "select", "send_handle", - "shared_object", "shadow_pseudo_fs", + "shared_object", "shebang_test_script", "shm", "sid", diff --git a/libos/test/regression/tests_musl.toml b/libos/test/regression/tests_musl.toml index b553efa678..22bc952f7f 100644 --- a/libos/test/regression/tests_musl.toml +++ b/libos/test/regression/tests_musl.toml @@ -62,9 +62,9 @@ manifests = [ "gettimeofday", "groups", "helloworld", + "host_root_fs", "hostname", "hostname_extra_runtime_conf", - "host_root_fs", "init_fail", "itimer", "keys", @@ -77,6 +77,7 @@ manifests = [ "mmap_file", "mmap_file_backed", "mmap_file_emulated", + "mock_syscalls", "mprotect_file_fork", "mprotect_prot_growsdown", "multi_pthread", diff --git a/python/graminelibos/manifest_check.py b/python/graminelibos/manifest_check.py index 1f0b9ea6a6..c703fb3053 100644 --- a/python/graminelibos/manifest_check.py +++ b/python/graminelibos/manifest_check.py @@ -126,6 +126,8 @@ # not validated. 'ioctl_structs': {str: object}, + 'mock_syscalls': [{Required('name'): str, 'return': int}], + 'stack': {'size': _size}, }, })