From 6470f8f873e97f10fcb98c20e454895538fb2cff Mon Sep 17 00:00:00 2001 From: Nadav Har'El Date: Wed, 8 Jul 2015 23:09:20 +0300 Subject: [PATCH] fcntl: clean up O_CLOEXEC and friends This patch cleans up fcntl() from assertions that cannot fail and similar nonsense, but more importantly - corrects (somewhat) our bogus treatment of the O_CLOEXEC / FD_CLOEXEC flags. Our treatment of fcntl(fd, F_SETFD, FD_CLOEXEC) is inherently flawed, because we decided to save it as a bit on the open-file flags (fp->f_flags), which means that dup()licate file descriptors will get the same setting of this bit, contrary to the expectation that this bit is per-file-descriptor (and usually unset on the newly dup()ed file descriptor). However, this small flaw shouldn't really matter because the close-on-exec flag really doesn't matter on OSv, which doesn't have exec() at all. Nevertheless, our implementation had an additional, even more serious, bug - namely, we put the bit FD_CLOEXEC itself on f_flags. But FD_CLOEXEC is 1, and has a different meaning in f_flags (1==FREAD)! We need to use the O_CLOEXEC bit, not FD_CLOEXEC... So this patch changes that. Finally, this patch also adds the bit 0100000 to the response from F_GETFL. This bit ("the bit formerly known as O_LARGEFILE") is returned in 64-bit Linux, so we should return it too. This bit is not standardized by POSIX, but the Linux Standard Base documentation of fcntl() claim Linux is within its rights to add it. I am guessing no application depends on it, but better safe than sorry... Also in this patch is a test for all the situations described above, which after this patch passes on both Linux and OSv. One check is commented out in the test, because it still fails on OSv: the check that a dup()ed file descriptor gets its FD_CLOEXEC reset. As explained above, this is not yet supported in OSv (and probably has no practical significance anyway). Signed-off-by: Nadav Har'El Signed-off-by: Avi Kivity --- fs/vfs/main.cc | 24 +++++----- modules/tests/Makefile | 2 +- tests/tst-fcntl.cc | 99 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+), 11 deletions(-) create mode 100644 tests/tst-fcntl.cc diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 5b0c1f5da1..003193babb 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1425,8 +1425,6 @@ int dup2(int oldfd, int newfd) */ #define SETFL (O_APPEND | O_ASYNC | O_DIRECT | O_NOATIME | O_NONBLOCK) -#define SETFL_IGNORED (~SETFL) - TRACEPOINT(trace_vfs_fcntl, "%d %d 0x%x", int, int, int); TRACEPOINT(trace_vfs_fcntl_ret, "\"%s\"", int); TRACEPOINT(trace_vfs_fcntl_err, "%d", int); @@ -1443,6 +1441,13 @@ int fcntl(int fd, int cmd, int arg) if (error) goto out_errno; + // An important note about our handling of FD_CLOEXEC / O_CLOEXEC: + // close-on-exec shouldn't have been a file flag (fp->f_flags) - it is a + // file descriptor flag, meaning that that two dup()ed file descriptors + // could have different values for FD_CLOEXEC. Our current implementation + // *wrongly* makes close-on-exec an f_flag (using the bit O_CLOEXEC). + // There is little practical difference, though, because this flag is + // ignored in OSv anyway, as it doesn't support exec(). switch (cmd) { case F_DUPFD: error = _fdalloc(fp, &ret, arg); @@ -1450,22 +1455,21 @@ int fcntl(int fd, int cmd, int arg) goto out_errno; break; case F_GETFD: - ret = fp->f_flags & FD_CLOEXEC; + ret = (fp->f_flags & O_CLOEXEC) ? FD_CLOEXEC : 0; break; case F_SETFD: FD_LOCK(fp); - fp->f_flags = (fp->f_flags & ~FD_CLOEXEC) | - (arg & FD_CLOEXEC); + fp->f_flags = (fp->f_flags & ~O_CLOEXEC) | + ((arg & FD_CLOEXEC) ? O_CLOEXEC : 0); FD_UNLOCK(fp); break; case F_GETFL: - ret = oflags(fp->f_flags); + // As explained above, the O_CLOEXEC should have been in f_flags, + // and shouldn't be returned. Linux always returns 0100000 ("the + // flag formerly known as O_LARGEFILE) so let's do it too. + ret = (oflags(fp->f_flags) & ~O_CLOEXEC) | 0100000; break; case F_SETFL: - /* Ignore flags */ - arg &= ~SETFL_IGNORED; - - assert((arg & ~SETFL) == 0); FD_LOCK(fp); fp->f_flags = fflags((oflags(fp->f_flags) & ~SETFL) | (arg & SETFL)); diff --git a/modules/tests/Makefile b/modules/tests/Makefile index 86606c608b..b12acc23e6 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -79,7 +79,7 @@ tests := tst-pthread.so tst-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-pthread-affinity.so tst-pthread-tsd.so tst-thread-local.so \ tst-zfs-mount.so tst-regex.so tst-tcp-siocoutq.so \ libtls.so tst-tls.so tst-select-timeout.so tst-faccessat.so \ - tst-fstatat.so misc-reboot.so + tst-fstatat.so misc-reboot.so tst-fcntl.so # libstatic-thread-variable.so tst-static-thread-variable.so \ diff --git a/tests/tst-fcntl.cc b/tests/tst-fcntl.cc new file mode 100644 index 0000000000..a4beaade6b --- /dev/null +++ b/tests/tst-fcntl.cc @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2015 Cloudius Systems, Ltd. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +// Tests for various fcntl() issues behave as they do on Linux. +// +// To compile on Linux, use: c++ -g -pthread -std=c++11 tst-fcntl.cc + + +#include +#include + +#include +#include +#include + +static int tests = 0, fails = 0; + +static void report(bool ok, std::string msg) +{ + ++tests; + fails += !ok; + std::cout << (ok ? "PASS" : "FAIL") << ": " << msg << "\n"; +} + +int main(int ac, char** av) +{ + int fd = open("/tmp/tst-fcntl", O_CREAT | O_RDWR, 0777); + report(fd > 0, "open"); + + // GETFL on a default opened file should return the O_RDWR, plus + // 0100000 (O_LARGEFILE, although this macro doesn't work). + // The latter is not required by POSIX, but is returned on 64-bit + // Linux, and allowed by POSIX (as explained in the Linux Standard + // Base documentation for fcntl. + int r = fcntl(fd, F_GETFL); + report((r & O_ACCMODE) == O_RDWR, "GETFL access mode"); + report((r & ~O_ACCMODE) == 0100000, "GETFL rest"); + int save_r = r; + + // Set with F_SETFL the O_NONBLOCK mode + r = fcntl(fd, F_SETFL, save_r | O_NONBLOCK); + report(r == 0, "F_SETFL O_NONBLOCK"); + r = fcntl(fd, F_GETFL); + report(r == (save_r | O_NONBLOCK), "F_GETFL"); + save_r = r; + + // GETFD on a default opened file should return 0 + r = fcntl(fd, F_GETFD); + report(r == 0, "GETFD default"); + + + // SETFD with close on exec + r = fcntl(fd, F_SETFD, FD_CLOEXEC); + report(r == 0, "SETFD FD_CLOEXEC"); + + // GETFD should now return FD_CLOEXEC + r = fcntl(fd, F_GETFD); + report(r == FD_CLOEXEC, "GETFD"); + + // GETFL should be unchanged by this setting of SETFD + r = fcntl(fd, F_GETFL); + report(r == save_r, "GETFL"); + + // dup() should have the same GETFL, but different GETFD + // (back to the default) + int dupfd = dup(fd); + r = fcntl(dupfd, F_GETFL); + report(r == save_r, "GETFL dup"); +#if 0 + // Unfortunately, the following test does not pass on OSv, because it + // currently makes FD_CLOEXEC per open file, instead of per-fd. + r = fcntl(dupfd, F_GETFD); + report(r == 0, "GETFD dup"); +#endif + close(dupfd); + + // O_CLOEXEC is *not* a file bit. We shouldn't be able to set + // it with F_SETFL (Linux just ignores it), but when we get the + // flags with F_GETFL, we won't see this bogus bit. + r = fcntl(fd, F_SETFL, save_r | O_CLOEXEC); + report(r == 0, "SETFL bogus"); + r = fcntl(fd, F_GETFL); + report(r == save_r, "GETFL bogus"); + + close(fd); + remove("/tmp/tst-fcntl"); + + report(fcntl(fd, F_GETFL) == -1 && errno == EBADF, "fcntl on closed fd"); + + + std::cout << "SUMMARY: " << tests << " tests, " << fails << " failures\n"; + return fails; +} + +