Skip to content

Commit

Permalink
Add linux-compatible memfd_create
Browse files Browse the repository at this point in the history
memfd_create is effectively a SHM_ANON shm_open(2) mapping with optional
CLOEXEC and file sealing support. This is used by some mesa parts, some
linux libs, and qemu can also take advantage of it and uses the sealing to
prevent resizing the region.

This reimplements shm_open in terms of shm_open2(2) at the same time.

shm_open(2) will be moved to COMPAT12 shortly.

Reviewed by:	markj, kib
Differential Revision:	https://reviews.freebsd.org/D21393
  • Loading branch information
kevans91 committed Sep 25, 2019
1 parent 3988008 commit 575e351
Show file tree
Hide file tree
Showing 9 changed files with 517 additions and 6 deletions.
3 changes: 2 additions & 1 deletion Makefile.inc1
Expand Up @@ -947,7 +947,8 @@ _cleanobj_fast_depend_hack: .PHONY
# Syscall stubs rewritten in C and obsolete MD assembly implementations
# Date SVN Rev Syscalls
# 20180604 r334626 brk sbrk
.for f in brk sbrk
# 20190916 r35XXXX shm_open
.for f in brk sbrk shm_open
@if [ -e "${OBJTOP}/lib/libc/.depend.${f}.o" ] && \
egrep -qw '${f}\.[sS]' ${OBJTOP}/lib/libc/.depend.${f}.o; then \
echo "Removing stale dependencies for ${f} syscall wrappers"; \
Expand Down
1 change: 1 addition & 0 deletions lib/libc/include/libc_private.h
Expand Up @@ -391,6 +391,7 @@ __pid_t __sys_wait6(enum idtype, __id_t, int *, int,
struct __wrusage *, struct __siginfo *);
__ssize_t __sys_write(int, const void *, __size_t);
__ssize_t __sys_writev(int, const struct iovec *, int);
int __sys_shm_open2(const char *, int, __mode_t, int, const char *);

int __libc_sigaction(int, const struct sigaction *,
struct sigaction *) __hidden;
Expand Down
4 changes: 3 additions & 1 deletion lib/libc/sys/Makefile.inc
Expand Up @@ -46,6 +46,7 @@ PSEUDO+= _getdirentries.o

SRCS+= brk.c
SRCS+= pipe.c
SRCS+= shm_open.c
SRCS+= vadvise.c

SRCS+= compat-stub.c
Expand Down Expand Up @@ -475,7 +476,8 @@ MLINKS+=setuid.2 setegid.2 \
setuid.2 seteuid.2 \
setuid.2 setgid.2
MLINKS+=shmat.2 shmdt.2
MLINKS+=shm_open.2 shm_unlink.2
MLINKS+=shm_open.2 memfd_create.3 \
shm_open.2 shm_unlink.2
MLINKS+=sigwaitinfo.2 sigtimedwait.2
MLINKS+=stat.2 fstat.2 \
stat.2 fstatat.2 \
Expand Down
1 change: 1 addition & 0 deletions lib/libc/sys/Symbol.map
Expand Up @@ -409,6 +409,7 @@ FBSD_1.6 {
fhreadlink;
getfhat;
funlinkat;
memfd_create;
};

FBSDprivate_1.0 {
Expand Down
97 changes: 93 additions & 4 deletions lib/libc/sys/shm_open.2
Expand Up @@ -28,11 +28,11 @@
.\"
.\" $FreeBSD$
.\"
.Dd January 20, 2017
.Dd September 24, 2019
.Dt SHM_OPEN 2
.Os
.Sh NAME
.Nm shm_open , shm_unlink
.Nm memfd_create , shm_open , shm_unlink
.Nd "shared memory object operations"
.Sh LIBRARY
.Lb libc
Expand All @@ -41,6 +41,8 @@
.In sys/mman.h
.In fcntl.h
.Ft int
.Fn memfd_create "const char *name" "unsigned int flags"
.Ft int
.Fn shm_open "const char *path" "int flags" "mode_t mode"
.Ft int
.Fn shm_unlink "const char *path"
Expand Down Expand Up @@ -139,14 +141,64 @@ The
.Fn shm_unlink
system call removes a shared memory object named
.Fa path .
.Pp
The
.Fn memfd_create
function creates an anonymous shared memory object, identical to that created
by
.Fn shm_open
when
.Dv SHM_ANON
is specified.
Newly created objects start off with a size of zero.
The size of the new object must be adjusted via
.Xr ftruncate 2 .
.Pp
The
.Fa name
argument must not be
.Dv NULL ,
but it may be an empty string.
The length of the
.Fa name
argument may not exceed
.Dv NAME_MAX
minus six characters for the prefix
.Dq memfd: ,
which will be prepended.
The
.Fa name
argument is intended solely for debugging purposes and will never be used by the
kernel to identify a memfd.
Names are therefore not required to be unique.
.Pp
The following
.Fa flags
may be specified to
.Fn memfd_create :
.Bl -tag -width MFD_ALLOW_SEALING
.It Dv MFD_CLOEXEC
Set
.Dv FD_CLOEXEC
on the resulting file descriptor.
.It Dv MFD_ALLOW_SEALING
Allow adding seals to the resulting file descriptor using the
.Dv F_ADD_SEALS
.Xr fcntl 2
command.
.It Dv MFD_HUGETLB
This flag is currently unsupported.
.El
.Sh RETURN VALUES
If successful,
.Fn memfd_create
and
.Fn shm_open
returns a non-negative integer,
both return a non-negative integer,
and
.Fn shm_unlink
returns zero.
Both functions return -1 on failure, and set
All three functions return -1 on failure, and set
.Va errno
to indicate the error.
.Sh COMPATIBILITY
Expand Down Expand Up @@ -220,6 +272,33 @@ This example fails without the call to
errx(EX_IOERR, "%s: pwrite length mismatch", __func__);
.Ed
.Sh ERRORS
.Fn memfd_create
fails with these error codes for these conditions:
.Bl -tag -width Er
.It Bq Er EBADF
The
.Fa name
argument was
.Dv NULL .
.It Bq Er EINVAL
The
.Fa name
argument was too long.
.Pp
An invalid or unsupported flag was included in
.Fa flags .
.It Bq Er EMFILE
The process has already reached its limit for open file descriptors.
.It Bq Er ENFILE
The system file table is full.
.It Bq Er ENOSYS
In
.Fn memfd_create ,
.Dv MFD_HUGETLB
was specified in
.Fa flags ,
and this system does not support forced hugetlb mappings.
.El
.Pp
.Fn shm_open
fails with these error codes for these conditions:
.Bl -tag -width Er
Expand Down Expand Up @@ -290,13 +369,23 @@ requires write permission to the shared memory object.
.Xr sendfile 2
.Sh STANDARDS
The
.Fn memfd_create
function is expected to be compatible with the Linux system call of the same
name.
.Pp
The
.Fn shm_open
and
.Fn shm_unlink
functions are believed to conform to
.St -p1003.1b-93 .
.Sh HISTORY
The
.Fn memfd_create
function appeared in
.Fx 13.0 .
.Pp
The
.Fn shm_open
and
.Fn shm_unlink
Expand Down
113 changes: 113 additions & 0 deletions lib/libc/sys/shm_open.c
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice(s), this list of conditions and the following disclaimer as
* the first lines of this file unmodified other than the possible
* addition of one or more copyright notices.
* 2. Redistributions in binary form must reproduce the above copyright
* notice(s), this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>

#include "libc_private.h"

/*
 * Export shm_open() under the additional names _shm_open and __sys_shm_open
 * as well (presumably the names the generated syscall stub used to provide
 * before this wrapper was rewritten in C -- confirm against the libc symbol
 * maps).
 */
__weak_reference(shm_open, _shm_open);
__weak_reference(shm_open, __sys_shm_open);

/*
 * If <sys/syscall.h> does not define a freebsd12-compat syscall number for
 * shm_open, fall back to the current SYS_shm_open number so that the legacy
 * path in shm_open() below still builds.
 */
#ifndef SYS_freebsd12_shm_open
#define SYS_freebsd12_shm_open SYS_shm_open
#endif

/*
 * Threshold compared against __getosreldate() in shm_open(): at or above this
 * value shm_open2(2) is used, below it the legacy shm_open(2) syscall is used.
 */
#define SHM_OPEN2_OSREL 1300048

/* Prefix prepended by memfd_create() to the caller-supplied name. */
#define MEMFD_NAME_PREFIX "memfd:"

/*
 * Open a shared memory object, picking the syscall that matches the running
 * kernel.  The return value of the chosen syscall is handed back unchanged.
 */
int
shm_open(const char *path, int flags, mode_t mode)
{

	/*
	 * Kernels older than SHM_OPEN2_OSREL get the original shm_open(2)
	 * syscall.  That interface enforces O_CLOEXEC inside the kernel, so
	 * the flags are passed through as given.
	 */
	if (__getosreldate() < SHM_OPEN2_OSREL)
		return (syscall(SYS_freebsd12_shm_open, path, flags, mode));

	/*
	 * shm_open2(2) does not enforce O_CLOEXEC on its own (which is what
	 * lets memfd_create(), for instance, return an fd without CLOEXEC),
	 * so add it here explicitly.
	 */
	return (__sys_shm_open2(path, flags | O_CLOEXEC, mode, 0, NULL));
}

/*
 * Create an anonymous shared memory object, intended to be compatible with
 * the Linux memfd_create(2) interface.
 *
 * name:  debugging label; must not be NULL but may be empty.  It is prefixed
 *        with MEMFD_NAME_PREFIX and the combined length may not exceed
 *        NAME_MAX.
 * flags: any combination of MFD_CLOEXEC and MFD_ALLOW_SEALING.  MFD_HUGETLB
 *        and the MFD_HUGE_* size encodings are recognized but not supported.
 *
 * Returns the result of shm_open2(2) on success.  On a validation failure
 * returns -1 with errno set to:
 *   EBADF   name is NULL;
 *   EINVAL  name is too long, flags contains unknown bits, or MFD_HUGETLB
 *           and a MFD_HUGE_* size were not specified together;
 *   ENOSYS  MFD_HUGETLB was requested.
 *
 * The name argument is passed to the kernel, but the kernel doesn't currently
 * do anything with it.  Linux exposes it in linprocfs for debugging purposes
 * only, but our kernel currently will not do the same.
 */
int
memfd_create(const char *name, unsigned int flags)
{
	char memfd_name[NAME_MAX + 1];
	size_t namelen;
	int oflags, shmflags;

	/*
	 * Errors are reported as -1 plus errno.  Returning the error number
	 * itself would be indistinguishable from a valid file descriptor.
	 */
	if (name == NULL) {
		errno = EBADF;
		return (-1);
	}
	namelen = strlen(name);
	if (namelen + sizeof(MEMFD_NAME_PREFIX) - 1 > NAME_MAX) {
		errno = EINVAL;
		return (-1);
	}
	if ((flags & ~(MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB |
	    MFD_HUGE_MASK)) != 0) {
		errno = EINVAL;
		return (-1);
	}
	/* HUGETLB set with no size specified. */
	if ((flags & MFD_HUGETLB) != 0 && (flags & MFD_HUGE_MASK) == 0) {
		errno = EINVAL;
		return (-1);
	}
	/* Size specified but no HUGETLB. */
	if ((flags & MFD_HUGE_MASK) != 0 && (flags & MFD_HUGETLB) == 0) {
		errno = EINVAL;
		return (-1);
	}
	/* We don't actually support HUGETLB. */
	if ((flags & MFD_HUGETLB) != 0) {
		errno = ENOSYS;
		return (-1);
	}

	/* We've already validated that we're sufficiently sized. */
	snprintf(memfd_name, sizeof(memfd_name), "%s%s", MEMFD_NAME_PREFIX,
	    name);
	oflags = O_RDWR;
	shmflags = 0;
	if ((flags & MFD_CLOEXEC) != 0)
		oflags |= O_CLOEXEC;
	if ((flags & MFD_ALLOW_SEALING) != 0)
		shmflags |= SHM_ALLOW_SEALING;
	return (__sys_shm_open2(SHM_ANON, oflags, 0, shmflags, memfd_name));
}
27 changes: 27 additions & 0 deletions sys/sys/mman.h
Expand Up @@ -182,6 +182,30 @@
*/
#define SHM_ALLOW_SEALING 0x00000001

/*
 * Flags for memfd_create().
 */
#define MFD_ALLOW_SEALING 0x00000001
#define MFD_CLOEXEC 0x00000002

/* UNSUPPORTED */
#define MFD_HUGETLB 0x00000004

/*
 * Huge page size encodings: log2 of the page size, stored in the bits
 * covered by MFD_HUGE_MASK.  The constants are unsigned because the largest
 * encoding (34 << 26) does not fit in a signed 32-bit int, and the flags
 * argument of memfd_create() is unsigned int.
 */
#define MFD_HUGE_MASK 0xFC000000
#define MFD_HUGE_SHIFT 26
#define MFD_HUGE_64KB (16U << MFD_HUGE_SHIFT)
#define MFD_HUGE_512KB (19U << MFD_HUGE_SHIFT)
#define MFD_HUGE_1MB (20U << MFD_HUGE_SHIFT)
#define MFD_HUGE_2MB (21U << MFD_HUGE_SHIFT)
#define MFD_HUGE_8MB (23U << MFD_HUGE_SHIFT)
#define MFD_HUGE_16MB (24U << MFD_HUGE_SHIFT)
#define MFD_HUGE_32MB (25U << MFD_HUGE_SHIFT)
#define MFD_HUGE_256MB (28U << MFD_HUGE_SHIFT)
#define MFD_HUGE_512MB (29U << MFD_HUGE_SHIFT)
#define MFD_HUGE_1GB (30U << MFD_HUGE_SHIFT)
#define MFD_HUGE_2GB (31U << MFD_HUGE_SHIFT)
#define MFD_HUGE_16GB (34U << MFD_HUGE_SHIFT)

#endif /* __BSD_VISIBLE */

/*
Expand Down Expand Up @@ -291,6 +315,9 @@ int munlockall(void);
int shm_open(const char *, int, mode_t);
int shm_unlink(const char *);
#endif
#if __BSD_VISIBLE
int memfd_create(const char *, unsigned int);
#endif
__END_DECLS

#endif /* !_KERNEL */
Expand Down
1 change: 1 addition & 0 deletions tests/sys/kern/Makefile
Expand Up @@ -9,6 +9,7 @@ TESTSDIR= ${TESTSBASE}/sys/kern
ATF_TESTS_C+= kern_copyin
ATF_TESTS_C+= kern_descrip_test
ATF_TESTS_C+= kill_zombie
ATF_TESTS_C+= memfd_test
ATF_TESTS_C+= ptrace_test
TEST_METADATA.ptrace_test+= timeout="15"
ATF_TESTS_C+= reaper
Expand Down

0 comments on commit 575e351

Please sign in to comment.