Skip to content

Commit

Permalink
[LibOS] Update mmaped regions when writing to files
Browse files Browse the repository at this point in the history
Previously, Gramine didn't propagate changes to a mmaped file back to
memory. This is unexpected for apps that use both mmap and read/write on
the same file (e.g., etcd-io/bbolt has such implementation to write data
via `write()` and read it via the mmaped memory).

This commit introduces the reloading of affected mmaped regions on
`chroot_encrypted`, `chroot` and `tmpfs` file writes. To reduce
performance penalty in normal cases (where apps use either `write()` or
`mmap()`), this commit also creates a new field in the inode to record
the number of mmaped regions of this file. We hence simply check on this
field to determine whether to perform the reloading to make the overhead
negligible if none mmaped.

Signed-off-by: Kailun Qin <kailun.qin@intel.com>
  • Loading branch information
kailun-qin committed Nov 23, 2023
1 parent d052d18 commit 1ea3e60
Show file tree
Hide file tree
Showing 12 changed files with 345 additions and 9 deletions.
3 changes: 3 additions & 0 deletions libos/include/libos_fs.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,9 @@ struct libos_inode {
/* Filesystem-specific data */
void* data;

/* Number of VMAs the file is mmapped to; should be accessed using atomic operations. */
uint64_t num_mmapped;

struct libos_lock lock;
refcount_t ref_count;
};
Expand Down
3 changes: 3 additions & 0 deletions libos/include/libos_vma.h
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ int msync_range(uintptr_t begin, uintptr_t end);
/* Call `msync` for file mappings of `hdl` */
int msync_handle(struct libos_handle* hdl);

/* Reload file mappings of `hdl` */
int reload_mmaped_from_file_handle(struct libos_handle* hdl);

void debug_print_all_vmas(void);

/* Returns the peak amount of memory usage */
Expand Down
116 changes: 116 additions & 0 deletions libos/src/bookkeep/libos_vma.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ static void copy_vma(struct libos_vma* old_vma, struct libos_vma* new_vma) {
new_vma->flags = old_vma->flags;
new_vma->file = old_vma->file;
if (new_vma->file) {
if (new_vma->file->inode)
(void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED);
get_handle(new_vma->file);
}
new_vma->offset = old_vma->offset;
Expand Down Expand Up @@ -512,6 +514,12 @@ static struct libos_vma* alloc_vma(void) {

static void free_vma(struct libos_vma* vma) {
if (vma->file) {
if (vma->file->inode) {
uint64_t old_num_mmapped = __atomic_fetch_sub(&vma->file->inode->num_mmapped, 1,
__ATOMIC_RELAXED);
assert(old_num_mmapped > 0);
(void)old_num_mmapped;
}
put_handle(vma->file);
}

Expand Down Expand Up @@ -800,6 +808,8 @@ int bkeep_mmap_fixed(void* addr, size_t length, int prot, int flags, struct libo
new_vma->file = file;
if (new_vma->file) {
get_handle(new_vma->file);
if (new_vma->file->inode)
(void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED);
}
new_vma->offset = file ? offset : 0;
copy_comment(new_vma, comment ?: "");
Expand Down Expand Up @@ -1049,6 +1059,8 @@ int bkeep_mmap_any_in_range(void* _bottom_addr, void* _top_addr, size_t length,
new_vma->file = file;
if (new_vma->file) {
get_handle(new_vma->file);
if (new_vma->file->inode)
(void)__atomic_add_fetch(&new_vma->file->inode->num_mmapped, 1, __ATOMIC_RELAXED);
}
new_vma->offset = file ? offset : 0;
copy_comment(new_vma, comment ?: "");
Expand Down Expand Up @@ -1361,6 +1373,110 @@ static bool vma_filter_needs_msync(struct libos_vma* vma, void* arg) {
return true;
}

static bool vma_filter_needs_reload(struct libos_vma* vma, void* arg) {
struct libos_handle* hdl = arg;
assert(hdl && hdl->inode); /* guaranteed to have inode because invoked from `write` callback */

if (vma->flags & (VMA_UNMAPPED | VMA_INTERNAL | MAP_ANONYMOUS | MAP_PRIVATE))
return false;

assert(vma->file); /* check above filtered out non-file-backed mappings */

if (!vma->file->inode || vma->file->inode != hdl->inode)
return false;

if (!vma->file->fs || !vma->file->fs->fs_ops || !vma->file->fs->fs_ops->read)
return false;

if (!(vma->file->acc_mode & MAY_READ))
return false;

return true;
}

static int reload_vma(struct libos_vma_info* vma_info) {
int ret;
struct libos_handle* file = vma_info->file;
assert(file && file->fs && file->fs->fs_ops && file->fs->fs_ops->read);

/* NOTE: Unfortunately there's a data race here: the memory can be unmapped, or remapped, by
* another thread by the time we get to `read`. */
uintptr_t read_begin = (uintptr_t)vma_info->addr;
uintptr_t read_end = (uintptr_t)vma_info->addr + vma_info->length;
assert(IS_ALLOC_ALIGNED(read_begin));
assert(IS_ALLOC_ALIGNED(read_end));

size_t size = read_end - read_begin;
size_t read = 0;
file_off_t pos = (file_off_t)vma_info->file_offset;
pal_prot_flags_t pal_prot = LINUX_PROT_TO_PAL(vma_info->prot, vma_info->flags);
pal_prot_flags_t pal_prot_writable = pal_prot | PAL_PROT_WRITE;

if (pal_prot != pal_prot_writable) {
/* make the area writable so that it can be reloaded */
ret = PalVirtualMemoryProtect((void*)read_begin, size, pal_prot_writable);
if (ret < 0)
return pal_to_unix_errno(ret);
}

while (read < size) {
size_t to_read = size - read;
ssize_t count = file->fs->fs_ops->read(file, (void*)(read_begin + read), to_read, &pos);
if (count < 0) {
if (count == -EINTR || count == -EAGAIN) {
continue;
}
ret = count;
goto out;
} else if (count == 0) {
/* it's possible that the underlying file contents do not cover the whole VMA region */
break;
}
assert((size_t)count <= to_read);
read += count;
}

ret = 0;
out:
if (pal_prot != pal_prot_writable) {
/* the area was made writable above; restore the original permissions */
int protect_ret = PalVirtualMemoryProtect((void*)read_begin, size, pal_prot);
if (protect_ret < 0) {
log_error("restore original permissions failed: %s", pal_strerror(protect_ret));
BUG();
}
}

return ret;
}

/* This helper function is to reload the VMA contents of a given file handle on `write`.
*
* NOTE: the `write` callback can be invoked from multiple paths (syscalls like `munmap()`,
* `mmap(MAP_FIXED_NOREPLACE)` and `msync()`) via the `msync` callback, so blindly reloading the VMA
* contents on e.g. `munmap()` can be inefficient (but unmapping file-backed memory regions
* shouldn't be a frequent operation). */
int reload_mmaped_from_file_handle(struct libos_handle* hdl) {
struct libos_vma_info* vma_infos;
size_t count;

int ret = dump_vmas(&vma_infos, &count, /*begin=*/0, /*end=*/UINTPTR_MAX,
vma_filter_needs_reload, hdl);
if (ret < 0)
return ret;

for (size_t i = 0; i < count; i++) {
ret = reload_vma(&vma_infos[i]);
if (ret < 0)
goto out;
}

ret = 0;
out:
free_vma_info_array(vma_infos, count);
return ret;
}

static int msync_all(uintptr_t begin, uintptr_t end, struct libos_handle* hdl) {
assert(IS_ALLOC_ALIGNED(begin));
assert(end == UINTPTR_MAX || IS_ALLOC_ALIGNED(end));
Expand Down
20 changes: 15 additions & 5 deletions libos/src/fs/chroot/encrypted.c
Original file line number Diff line number Diff line change
Expand Up @@ -450,18 +450,28 @@ static ssize_t chroot_encrypted_write(struct libos_handle* hdl, const void* buf,
lock(&hdl->inode->lock);

int ret = encrypted_file_write(enc, buf, count, *pos, &actual_count);
if (ret < 0)
goto out;
if (ret < 0) {
unlock(&hdl->inode->lock);
return ret;
}

assert(actual_count <= count);
*pos += actual_count;
if (hdl->inode->size < *pos)
hdl->inode->size = *pos;

ret = 0;
out:
unlock(&hdl->inode->lock);
return ret < 0 ? ret : (ssize_t)actual_count;

/* If there are any MAP_SHARED mappings for the file, this will read data from `enc`. */
if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) {
ret = reload_mmaped_from_file_handle(hdl);
if (ret < 0) {
log_error("reload mmapped regions of file failed: %s", unix_strerror(ret));
BUG();
}
}

return (ssize_t)actual_count;
}

static int chroot_encrypted_truncate(struct libos_handle* hdl, file_off_t size) {
Expand Down
13 changes: 12 additions & 1 deletion libos/src/fs/chroot/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "libos_internal.h"
#include "libos_lock.h"
#include "libos_utils.h"
#include "libos_vma.h"
#include "linux_abi/errors.h"
#include "linux_abi/fs.h"
#include "linux_abi/memory.h"
Expand Down Expand Up @@ -310,7 +311,17 @@ static ssize_t chroot_write(struct libos_handle* hdl, const void* buf, size_t co
hdl->inode->size = *pos;
unlock(&hdl->inode->lock);
}
return actual_count;

/* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */
if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) {
ret = reload_mmaped_from_file_handle(hdl);
if (ret < 0) {
log_error("reload mmapped regions of file failed: %s", unix_strerror(ret));
BUG();
}
}

return (ssize_t)actual_count;
}

static int chroot_mmap(struct libos_handle* hdl, void* addr, size_t size, int prot, int flags,
Expand Down
17 changes: 14 additions & 3 deletions libos/src/fs/tmpfs/fs.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,17 +259,28 @@ static ssize_t tmpfs_write(struct libos_handle* hdl, const void* buf, size_t siz
struct libos_mem_file* mem = inode->data;

ret = mem_file_write(mem, *pos, buf, size);
if (ret < 0)
goto out;
if (ret < 0) {
unlock(&inode->lock);
return ret;
}

inode->size = mem->size;

*pos += ret;
inode->mtime = time_us / USEC_IN_SEC;
/* keep `ret` */

out:
unlock(&inode->lock);

/* If there are any MAP_SHARED mappings for the file, this will read data from `hdl`. */
if (__atomic_load_n(&hdl->inode->num_mmapped, __ATOMIC_ACQUIRE) != 0) {
int reload_ret = reload_mmaped_from_file_handle(hdl);
if (reload_ret < 0) {
log_error("reload mmapped regions of file failed: %s", unix_strerror(reload_ret));
BUG();
}
}

return ret;
}

Expand Down
1 change: 1 addition & 0 deletions libos/test/fs/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ tests = {
'open_close': {},
'open_flags': {},
'read_write': {},
'read_write_mmap': {},
'seek_tell': {},
'seek_tell_truncate': {},
'stat': {},
Expand Down
96 changes: 96 additions & 0 deletions libos/test/fs/read_write_mmap.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#include "common.h"

static void read_write_mmap(const char* file_path) {
const size_t size = 1024 * 1024;
int fd = open_output_fd(file_path, /*rdwr=*/true);
printf("open(%s) RW (mmap) OK\n", file_path);

if (ftruncate(fd, size) == -1) {
close(fd);
fatal_error("ftruncate\n");
}

void* buf_write = alloc_buffer(size);
void* buf_read = alloc_buffer(size);

void* m = mmap_fd(file_path, fd, PROT_READ | PROT_WRITE, 0, size);
printf("mmap_fd(%zu) OK\n", size);
fill_random(m, size);
int ret = msync(m, size, MS_SYNC);
if (ret < 0) {
close(fd);
fatal_error("msync\n");
}

read_fd(file_path, fd, buf_read, size);
printf("read(%s) 1 RW (mmap) OK\n", file_path);
if (memcmp(m, buf_read, size) != 0)
fatal_error("Read data via read() is different from what was written in the mapping\n");

fill_random(buf_write, size);
seek_fd(file_path, fd, 0, SEEK_SET);
printf("seek(%s) 1 RW (mmap) OK\n", file_path);
write_fd(file_path, fd, buf_write, size);
printf("write(%s) RW (mmap) OK\n", file_path);
seek_fd(file_path, fd, 0, SEEK_SET);
printf("seek(%s) 2 RW (mmap) OK\n", file_path);
read_fd(file_path, fd, buf_read, size);
printf("read(%s) 2 RW (mmap) OK\n", file_path);
if (memcmp(buf_write, buf_read, size) != 0)
fatal_error("Read data via read() is different from what was written via write()\n");
if (memcmp(buf_write, m, size) != 0)
fatal_error("Read data in the mapping is different from what was written via write()\n");
printf("compare(%s) RW (mmap) OK\n", file_path);

munmap_fd(file_path, m, size);
printf("munmap_fd(%zu) OK\n", size);
close_fd(file_path, fd);
printf("close(%s) RW (mmap) OK\n", file_path);
free(buf_write);
free(buf_read);
}

static void read_write_mmap_different_fds(const char* file_path) {
const size_t size = 1024 * 1024;
int fd1 = open_output_fd(file_path, /*rdwr=*/true);
printf("open(%s) RW fd1 (mmap) OK\n", file_path);

if (ftruncate(fd1, size) == -1) {
close(fd1);
fatal_error("ftruncate fd1\n");
}

int fd2 = open_output_fd(file_path, /*rdwr=*/true);
printf("open(%s) RW fd2 OK\n", file_path);

void* m = mmap_fd(file_path, fd1, PROT_READ, 0, size);
printf("mmap_fd(%zu) fd1 OK\n", size);

void* buf_write = alloc_buffer(size);
fill_random(buf_write, size);
write_fd(file_path, fd2, buf_write, size);
printf("write(%s) RW fd2 OK\n", file_path);

if (memcmp(m, buf_write, size) != 0)
fatal_error("Read data from the mapping is different from what was written via write() by "
"another fd of the same file\n");

munmap_fd(file_path, m, size);
printf("munmap_fd(%zu) fd1 OK\n", size);
close_fd(file_path, fd1);
printf("close(%s) RW fd1 (mmap) OK\n", file_path);
close_fd(file_path, fd2);
printf("close(%s) RW fd2 OK\n", file_path);
free(buf_write);
}

int main(int argc, char* argv[]) {
if (argc < 2)
fatal_error("Usage: %s <file_path>\n", argv[0]);

setup();
read_write_mmap(argv[1]);
read_write_mmap_different_fds(argv[1]);

return 0;
}

0 comments on commit 1ea3e60

Please sign in to comment.