Skip to content

Commit

Permalink
[LibOS] Do not perform lock(&hdl->pos_lock) on non-seekable handles
Browse files Browse the repository at this point in the history
Handles can be file-like (allow seeking and thus have a file position)
and stream-like (do not allow seeking and thus file position field `pos`
is unused). Previously, LibOS emulations of `read()`, `write()`,
`sendfile()`, etc. did not distinguish between file-like and stream-like
handles and always acquired `hdl->pos_lock`.

This led to a bug on stream-like handles, in particular, on eventfd
handles: `read()` and `write()` operations are blocking for eventfds,
thus a blocking `read()` would acquire the `hdl->pos_lock` indefinitely
and e.g. hang another thread that performs `write()` on same eventfd.

This commit fixes this bug by skipping this lock on stream-like handles.
All handles now have a new field `seekable`, and stream-like handles
have it as `false` and file-like ones have it as `true`. This has an
additional performance benefit: stream-like handles like pipes, sockets
and eventfds don't spend cycles in acquiring and releasing the lock
inside the relevant (and very frequent) syscalls.

New LibOS regression test `eventfd_read_then_write` is added to test
this bug fix.

Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
  • Loading branch information
dimakuv committed Apr 3, 2024
1 parent ff06284 commit ec06c32
Show file tree
Hide file tree
Showing 17 changed files with 159 additions and 27 deletions.
14 changes: 12 additions & 2 deletions libos/include/libos_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,12 @@ struct libos_handle {
*/
struct libos_inode* inode;

/* Offset in file. Protected by `pos_lock`. */
/* Whether the handle is seekable (based on `type`). This field does not change, so reading it
* does not require holding any locks. Affects the `pos` field, see below. */
bool seekable;

/* Offset in file. Protected by `pos_lock`. Only meaningful for `seekable` handles; always zero
* for non-seekable handles. */
file_off_t pos;

/* This list contains `libos_epoll_item` objects this handle is part of. All accesses should be
Expand Down Expand Up @@ -221,7 +226,9 @@ struct libos_handle {

/* Lock for handle position (`pos`). Intended for operations that change the position (e.g.
* `read`, `seek` but not `pread`). This lock should be taken *before* `libos_handle.lock` and
* `libos_inode.lock`. */
* `libos_inode.lock`. Must be used *only* via maybe_lock_pos_handle() and
* maybe_unlock_pos_handle(); these functions make sure that the lock is acquired only on those
* handle types that are seekable (e.g. not on eventfds or pipes). */
struct libos_lock pos_lock;
};

Expand Down Expand Up @@ -295,3 +302,6 @@ int get_file_size(struct libos_handle* file, uint64_t* size);

ssize_t do_handle_read(struct libos_handle* hdl, void* buf, size_t count);
ssize_t do_handle_write(struct libos_handle* hdl, const void* buf, size_t count);

void maybe_lock_pos_handle(struct libos_handle* hdl);
void maybe_unlock_pos_handle(struct libos_handle* hdl);
10 changes: 10 additions & 0 deletions libos/src/bookkeep/libos_handle.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@ static MEM_MGR handle_mgr = NULL;

#define INIT_HANDLE_MAP_SIZE 32

void maybe_lock_pos_handle(struct libos_handle* hdl) {
if (hdl->seekable)
lock(&hdl->pos_lock);
}

void maybe_unlock_pos_handle(struct libos_handle* hdl) {
if (hdl->seekable)
unlock(&hdl->pos_lock);
}

int open_executable(struct libos_handle* hdl, const char* path) {
struct libos_dentry* dent = NULL;

Expand Down
2 changes: 2 additions & 0 deletions libos/src/fs/chroot/encrypted.c
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ static int chroot_encrypted_open(struct libos_handle* hdl, struct libos_dentry*
hdl->inode = dent->inode;
get_inode(dent->inode);
hdl->type = TYPE_CHROOT_ENCRYPTED;
hdl->seekable = true;
hdl->pos = 0;
return 0;
}
Expand Down Expand Up @@ -232,6 +233,7 @@ static int chroot_encrypted_creat(struct libos_handle* hdl, struct libos_dentry*
hdl->inode = dent->inode;
get_inode(inode);
hdl->type = TYPE_CHROOT_ENCRYPTED;
hdl->seekable = true;
hdl->pos = 0;
ret = 0;
out:
Expand Down
1 change: 1 addition & 0 deletions libos/src/fs/chroot/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,7 @@ static int chroot_do_open(struct libos_handle* hdl, struct libos_dentry* dent, m
uri = NULL;

hdl->type = TYPE_CHROOT;
hdl->seekable = true;
hdl->pos = 0;
hdl->pal_handle = palhdl;
} else {
Expand Down
5 changes: 3 additions & 2 deletions libos/src/fs/libos_fs_pseudo.c
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,16 @@ static int pseudo_open(struct libos_handle* hdl, struct libos_dentry* dent, int
}

hdl->type = TYPE_STR;
hdl->seekable = true;
mem_file_init(&hdl->info.str.mem, str, len);
hdl->pos = 0;
break;
}

case PSEUDO_DEV: {
hdl->type = TYPE_DEV;
hdl->seekable = true;
hdl->pos = 0;
if (node->dev.dev_ops.open) {
ret = node->dev.dev_ops.open(hdl, dent, flags);
if (ret < 0)
Expand Down Expand Up @@ -482,11 +485,9 @@ static int pseudo_poll(struct libos_handle* hdl, int events, int* out_events) {
switch (node->type) {
case PSEUDO_STR: {
assert(hdl->type == TYPE_STR);
lock(&hdl->pos_lock);
lock(&hdl->lock);
int ret = mem_file_poll(&hdl->info.str.mem, hdl->pos, events, out_events);
unlock(&hdl->lock);
unlock(&hdl->pos_lock);
return ret;
}

Expand Down
7 changes: 5 additions & 2 deletions libos/src/fs/libos_fs_util.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,10 @@ int generic_inode_hstat(struct libos_handle* hdl, struct stat* buf) {
file_off_t generic_inode_seek(struct libos_handle* hdl, file_off_t offset, int origin) {
file_off_t ret;

lock(&hdl->pos_lock);
if (!hdl->seekable)
return 0;

maybe_lock_pos_handle(hdl);
lock(&hdl->inode->lock);
file_off_t pos = hdl->pos;
file_off_t size = hdl->inode->size;
Expand All @@ -115,7 +118,7 @@ file_off_t generic_inode_seek(struct libos_handle* hdl, file_off_t offset, int o
ret = pos;
}
unlock(&hdl->inode->lock);
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
return ret;
}

Expand Down
3 changes: 3 additions & 0 deletions libos/src/fs/shm/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ static int shm_do_open(struct libos_handle* hdl, struct libos_dentry* dent, mode
uri = NULL;

hdl->type = TYPE_SHM;
hdl->seekable = true;
hdl->pos = 0;

hdl->pal_handle = palhdl;
ret = 0;

Expand Down
1 change: 1 addition & 0 deletions libos/src/fs/tmpfs/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ static void tmpfs_do_open(struct libos_handle* hdl, struct libos_dentry* dent, i
__UNUSED(flags);

hdl->type = TYPE_TMPFS;
hdl->seekable = true;
hdl->pos = 0;
}

Expand Down
17 changes: 9 additions & 8 deletions libos/src/sys/libos_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,8 @@ long libos_syscall_sendfile(int out_fd, int in_fd, off_t* offset, size_t count)
* If `offset` is not NULL, we use `*offset` as starting offset for reading, and update
* `*offset` afterwards (and keep the offset in input handle unchanged).
*
* If `offset` is NULL, we use the offset in input handle, and update it afterwards.
* If `offset` is NULL, we use the offset in input handle, and update it afterwards (only if the
* input handle is seekable).
*/
file_off_t pos_in = 0;
if (offset) {
Expand All @@ -495,9 +496,9 @@ long libos_syscall_sendfile(int out_fd, int in_fd, off_t* offset, size_t count)
goto out;
}
} else {
lock(&in_hdl->pos_lock);
maybe_lock_pos_handle(in_hdl);
pos_in = in_hdl->pos;
unlock(&in_hdl->pos_lock);
maybe_unlock_pos_handle(in_hdl);
}

if (!(out_hdl->acc_mode & MAY_WRITE)) {
Expand All @@ -523,9 +524,9 @@ long libos_syscall_sendfile(int out_fd, int in_fd, off_t* offset, size_t count)
break;
}

lock(&out_hdl->pos_lock);
maybe_lock_pos_handle(out_hdl);
ssize_t y = out_hdl->fs->fs_ops->write(out_hdl, buf, x, &out_hdl->pos);
unlock(&out_hdl->pos_lock);
maybe_unlock_pos_handle(out_hdl);
if (y < 0) {
ret = y;
goto out_update;
Expand All @@ -550,10 +551,10 @@ long libos_syscall_sendfile(int out_fd, int in_fd, off_t* offset, size_t count)
* declaration). Note that we do it even if one of the read/write operations failed. */
if (offset) {
*offset = pos_in;
} else {
lock(&in_hdl->pos_lock);
} else if (in_hdl->seekable) {
maybe_lock_pos_handle(in_hdl);
in_hdl->pos = pos_in;
unlock(&in_hdl->pos_lock);
maybe_unlock_pos_handle(in_hdl);
}

out:
Expand Down
16 changes: 8 additions & 8 deletions libos/src/sys/libos_open.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ ssize_t do_handle_read(struct libos_handle* hdl, void* buf, size_t count) {
if (hdl->is_dir)
return -EISDIR;

lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);
ssize_t ret = fs->fs_ops->read(hdl, buf, count, &hdl->pos);
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
return ret;
}

Expand Down Expand Up @@ -70,9 +70,9 @@ ssize_t do_handle_write(struct libos_handle* hdl, const void* buf, size_t count)
if (hdl->is_dir)
return -EISDIR;

lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);
ssize_t ret = fs->fs_ops->write(hdl, buf, count, &hdl->pos);
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
return ret;
}

Expand Down Expand Up @@ -179,7 +179,7 @@ static file_off_t do_lseek_dir(struct libos_handle* hdl, off_t offset, int origi
assert(hdl->is_dir);

lock(&g_dcache_lock);
lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);
lock(&hdl->lock);

file_off_t ret;
Expand All @@ -200,7 +200,7 @@ static file_off_t do_lseek_dir(struct libos_handle* hdl, off_t offset, int origi

out:
unlock(&hdl->lock);
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
unlock(&g_dcache_lock);
return ret;
}
Expand Down Expand Up @@ -360,7 +360,7 @@ static ssize_t do_getdents(int fd, uint8_t* buf, size_t buf_size, bool is_getden
}

lock(&g_dcache_lock);
lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);
lock(&hdl->lock);

struct libos_dir_handle* dirhdl = &hdl->dir_info;
Expand Down Expand Up @@ -447,7 +447,7 @@ static ssize_t do_getdents(int fd, uint8_t* buf, size_t buf_size, bool is_getden
ret = buf_pos;
out:
unlock(&hdl->lock);
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
unlock(&g_dcache_lock);
out_no_unlock:
put_handle(hdl);
Expand Down
5 changes: 4 additions & 1 deletion libos/src/sys/libos_poll.c
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,8 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) {
*
* Some handles (e.g. files) do have their own, handle-specific poll callback. In such case
* we do not add these handles for PAL polling, but instead populate `revents` of a
* handle-corresponding FD with the result of the poll callback.
* handle-corresponding FD with the result of the poll callback. Note that we acquire the
* handle's `pos_lock` because the poll callback may access the `pos` field.
*
* Finally, some handles (e.g. tty/console) implement a dummy poll callback that returns
* "Function not implemented" (-ENOSYS) error. We have this special case because it is
Expand All @@ -128,7 +129,9 @@ static long do_poll(struct pollfd* fds, size_t fds_len, uint64_t* timeout_us) {
*/
bool handle_specific_poll_invoked = false;
if (handle->fs && handle->fs->fs_ops && handle->fs->fs_ops->poll) {
maybe_lock_pos_handle(handle);
ret = handle->fs->fs_ops->poll(handle, events, &events);
maybe_unlock_pos_handle(handle);
if (ret < 0 && ret != -ENOSYS) {
/* ENOSYS implies that no handle-specific poll was found; other errors imply that
* there was a handle-specific poll, but its invocation failed for other reasons */
Expand Down
8 changes: 4 additions & 4 deletions libos/src/sys/libos_wrappers.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ long libos_syscall_readv(unsigned long fd, struct iovec* vec, unsigned long vlen
if (!hdl)
return -EBADF;

lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);

int ret = 0;

Expand Down Expand Up @@ -75,7 +75,7 @@ long libos_syscall_readv(unsigned long fd, struct iovec* vec, unsigned long vlen

ret = bytes;
out:
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
put_handle(hdl);
if (ret == -EINTR) {
ret = -ERESTARTSYS;
Expand Down Expand Up @@ -103,7 +103,7 @@ long libos_syscall_writev(unsigned long fd, struct iovec* vec, unsigned long vle
if (!hdl)
return -EBADF;

lock(&hdl->pos_lock);
maybe_lock_pos_handle(hdl);

int ret = 0;

Expand Down Expand Up @@ -143,7 +143,7 @@ long libos_syscall_writev(unsigned long fd, struct iovec* vec, unsigned long vle

ret = bytes;
out:
unlock(&hdl->pos_lock);
maybe_unlock_pos_handle(hdl);
put_handle(hdl);
if (ret == -EINTR) {
ret = -ERESTARTSYS;
Expand Down
90 changes: 90 additions & 0 deletions libos/test/regression/eventfd_read_then_write.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/* This tests a bug in LibOS that resulted in a hang (due to locking) of Gramine */

#define _POSIX_C_SOURCE 200112 /* for nanosleep */

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <time.h>
#include <unistd.h>

#include "common.h"

#define TEST_TIMES 100 /* each iteration has a sleep for 10ms, so total test time is 1s */

static int g_efd;
static bool g_read_thread_ready = false;

static void pthread_check(int x) {
if (x) {
printf("pthread failed with %d (%s)\n", x, strerror(x));
exit(1);
}
}

static void write_thread_once(void) {
/* wait until the other thread starts its blocking read and reset the status */
while (true) {
bool expected = true;
if (__atomic_compare_exchange_n(&g_read_thread_ready, &expected, /*desired=*/false,
/*weak=*/0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) {
break;
}
}

/* unfortunately there's no way to figure out (without side effects) if the other thread called
* blocking read() already, so use a sleep-for-a-bit heuristic */
struct timespec ts = { .tv_nsec = 10 * 1000 * 1000 };
if (nanosleep(&ts, NULL) < 0) {
err(1, "nanosleep failed");
}

uint64_t val = 42;
if (write(g_efd, &val, sizeof(val)) != sizeof(val))
errx(1, "eventfd write failed");
}

static void read_thread_once(void) {
__atomic_store_n(&g_read_thread_ready, true, __ATOMIC_SEQ_CST);

/* blocking read */
uint64_t val;
if (read(g_efd, &val, sizeof(val)) != sizeof(val))
errx(1, "eventfd read failed");

/* blocking read was unblocked by a write from another thread */
if (val != 42)
errx(1, "unexpected value read (%lu)", val);
}

static void* write_thread(void* arg) {
for (int i = 0; i < TEST_TIMES; i++) {
write_thread_once();
}
return NULL;
}

static void* read_thread(void* arg) {
for (int i = 0; i < TEST_TIMES; i++) {
read_thread_once();
}
return NULL;
}

int main(void) {
g_efd = CHECK(eventfd(0, 0));

pthread_t th[2];
pthread_check(pthread_create(&th[0], NULL, read_thread, NULL));
pthread_check(pthread_create(&th[1], NULL, write_thread, NULL));

pthread_check(pthread_join(th[0], NULL));
pthread_check(pthread_join(th[1], NULL));

CHECK(close(g_efd));
puts("TEST OK");
return 0;
}

0 comments on commit ec06c32

Please sign in to comment.