Skip to content

Commit

Permalink
[LibOS] Return 0 on zero-sized read() syscalls on sockets
Browse files Browse the repository at this point in the history
On Linux, `read(socket-fd, buf, /*count=*/0)` is special-cased to return
zero once all other checks have passed. The same applies to `readv()` when
the total length of its iovecs is zero. However, this special case does
*not* apply to `recvfrom()`, `recvmsg()` or `recvmmsg()`.

Linux code simply says "Match SYS5 behavior" in this corner case:
https://github.com/torvalds/linux/blob/99bd3cb0d12e85/net/socket.c#L1136-L1136

Apparently some applications/libraries rely on this behavior. Without
this corner case, a zero-sized read issued by such an app would hang (if
the socket is blocking) or unexpectedly fail with -EAGAIN (if the socket
is non-blocking). Note that the underlying PalSocketRecv() uses the
`recvmsg()` syscall, at least on Linux-based PALs, so a simple
fall-through to PAL would change the semantics of `read()`/`readv()`
issued by the app.

Signed-off-by: Dmitrii Kuvaiskii <dmitrii.kuvaiskii@intel.com>
  • Loading branch information
dimakuv committed Feb 26, 2024
1 parent a50192c commit 63ee38d
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 7 deletions.
2 changes: 1 addition & 1 deletion libos/include/libos_socket.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ extern struct libos_sock_ops sock_ip_ops;

ssize_t do_recvmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_len,
void* msg_control, size_t* msg_controllen_ptr, void* addr, size_t* addrlen_ptr,
unsigned int* flags);
unsigned int* flags, bool emulate_recv_error_semantics);
ssize_t do_sendmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_len,
void* msg_control, size_t msg_controllen, void* addr, size_t addrlen,
unsigned int flags);
6 changes: 4 additions & 2 deletions libos/src/fs/socket/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ static ssize_t read(struct libos_handle* handle, void* buf, size_t size, file_of
};
unsigned int flags = 0;
return do_recvmsg(handle, &iov, /*iov_len=*/1, /*msg_control=*/NULL,
/*msg_controllen_ptr=*/NULL, /*addr=*/NULL, /*addrlen_ptr=*/NULL, &flags);
/*msg_controllen_ptr=*/NULL, /*addr=*/NULL, /*addrlen_ptr=*/NULL, &flags,
/*emulate_recv_error_semantics=*/false);
}

static ssize_t write(struct libos_handle* handle, const void* buf, size_t size, file_off_t* pos) {
Expand All @@ -54,7 +55,8 @@ static ssize_t readv(struct libos_handle* handle, struct iovec* iov, size_t iov_
__UNUSED(pos);
unsigned int flags = 0;
return do_recvmsg(handle, iov, iov_len, /*msg_control=*/NULL, /*msg_controllen_ptr=*/NULL,
/*addr=*/NULL, /*addrlen_ptr=*/NULL, &flags);
/*addr=*/NULL, /*addrlen_ptr=*/NULL, &flags,
/*emulate_recv_error_semantics=*/false);
}

static ssize_t writev(struct libos_handle* handle, struct iovec* iov, size_t iov_len,
Expand Down
25 changes: 21 additions & 4 deletions libos/src/sys/libos_socket.c
Original file line number Diff line number Diff line change
Expand Up @@ -846,7 +846,7 @@ long libos_syscall_sendmmsg(int fd, struct mmsghdr* msg, unsigned int vlen, unsi
* is called directly from syscall handlers, which return values in such a way. */
ssize_t do_recvmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_len,
void* msg_control, size_t* msg_controllen_ptr, void* addr, size_t* addrlen_ptr,
unsigned int* flags) {
unsigned int* flags, bool emulate_recv_error_semantics) {
ssize_t ret = 0;
if (handle->type != TYPE_SOCK) {
return -ENOTSOCK;
Expand Down Expand Up @@ -885,6 +885,22 @@ ssize_t do_recvmsg(struct libos_handle* handle, struct iovec* iov, size_t iov_le
total_size += iov[i].iov_len;
}

if (!total_size && !emulate_recv_error_semantics) {
/*
* In Linux, read() and readv() -- i.e. not recv*() syscalls -- have a "match SYS5 behavior"
* corner case: 0 is returned if the requested number of bytes to receive is 0. The
* rationale for this behavior is unclear and lost in history. The relevant Linux code:
* https://github.com/torvalds/linux/blob/99bd3cb0d12e85/net/socket.c#L1136-L1136
*
* Apparently some applications/libraries rely on this behavior. Without this corner case,
* these apps would hang (if the socket is blocking) or unexpectedly return -EAGAIN (if the
* socket is non-blocking). Note that the underlying PalSocketRecv() uses `recvmsg()`
* syscall, at least on Linux-based PALs, so a simple fall-through to PAL would change the
* semantics of `read()`/`readv()` issued by the app.
*/
return 0;
}

/*
* Taking this lock (and potentially blocking until other thread releases it) should be fine
* in most cases, regardless of whether this read is blocking or not. If it is blocking, then
Expand Down Expand Up @@ -1027,7 +1043,7 @@ long libos_syscall_recvfrom(int fd, void* buf, size_t len, unsigned int flags, v
.iov_len = len,
};
ssize_t ret = do_recvmsg(handle, &iov, 1, /*msg_control=*/NULL, /*msg_controllen_ptr=*/NULL,
addr, &addrlen, &flags);
addr, &addrlen, &flags, /*emulate_recv_error_semantics=*/true);
if (ret >= 0 && addr) {
*_addrlen = addrlen;
}
Expand All @@ -1048,7 +1064,7 @@ long libos_syscall_recvmsg(int fd, struct msghdr* msg, unsigned int flags) {

size_t addrlen = msg->msg_name ? msg->msg_namelen : 0;
ret = do_recvmsg(handle, msg->msg_iov, msg->msg_iovlen, msg->msg_control, &msg->msg_controllen,
msg->msg_name, &addrlen, &flags);
msg->msg_name, &addrlen, &flags, /*emulate_recv_error_semantics=*/true);
if (ret >= 0) {
if (msg->msg_name) {
msg->msg_namelen = addrlen;
Expand Down Expand Up @@ -1094,7 +1110,8 @@ long libos_syscall_recvmmsg(int fd, struct mmsghdr* msg, unsigned int vlen, unsi
size_t addrlen = hdr->msg_name ? hdr->msg_namelen : 0;
unsigned int this_flags = flags;
ret = do_recvmsg(handle, hdr->msg_iov, hdr->msg_iovlen, hdr->msg_control,
&hdr->msg_controllen, hdr->msg_name, &addrlen, &this_flags);
&hdr->msg_controllen, hdr->msg_name, &addrlen, &this_flags,
/*emulate_recv_error_semantics=*/true);
if (ret < 0) {
if (i == 0) {
/* Return error directly. */
Expand Down

0 comments on commit 63ee38d

Please sign in to comment.