diff --git a/.mailmap b/.mailmap index d8c3f594d1..6f046b972d 100644 --- a/.mailmap +++ b/.mailmap @@ -1,6 +1,8 @@ Stanislav Kinsbursky Pavel Emelyanov -Andrey Vagin -Andrey Vagin -Andrey Vagin Andrew Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin +Andrei Vagin Cyrill Gorcunov diff --git a/.travis.yml b/.travis.yml index 82ba9fbc8f..9928f16c24 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: c -sudo: required -dist: xenial +os: linux +dist: bionic cache: ccache services: - docker @@ -9,35 +9,98 @@ env: - TR_ARCH=local CLANG=1 - TR_ARCH=local COMPAT_TEST=y - TR_ARCH=local CLANG=1 COMPAT_TEST=y - - TR_ARCH=alpine - - TR_ARCH=fedora-asan - TR_ARCH=x86_64 - TR_ARCH=x86_64 CLANG=1 - - TR_ARCH=armv7hf - - TR_ARCH=aarch64 - - TR_ARCH=ppc64le - - TR_ARCH=s390x - - TR_ARCH=armv7hf CLANG=1 - - TR_ARCH=aarch64 CLANG=1 - - TR_ARCH=ppc64le CLANG=1 - - TR_ARCH=alpine CLANG=1 - - TR_ARCH=docker-test - - TR_ARCH=fedora-rawhide - - TR_ARCH=fedora-rawhide-aarch64 - - TR_ARCH=centos - - TR_ARCH=podman-test -matrix: + - TR_ARCH=openj9-test +jobs: + include: + - os: linux + arch: ppc64le + env: TR_ARCH=local + dist: bionic + - os: linux + arch: ppc64le + env: TR_ARCH=local CLANG=1 + dist: bionic + - os: linux + arch: s390x + env: TR_ARCH=local + dist: bionic + - os: linux + arch: arm64 + env: TR_ARCH=local + dist: bionic + - os: linux + arch: arm64 + env: TR_ARCH=local CLANG=1 + dist: bionic + - os: linux + arch: arm64 + # This runs on aarch64 with 'setarch linux32' + env: TR_ARCH=armv7hf + dist: bionic + - os: linux + arch: arm64 + # This runs on aarch64 with 'setarch linux32' + env: TR_ARCH=armv7hf CLANG=1 + dist: bionic + - os: linux + arch: arm64 + env: TR_ARCH=fedora-rawhide + dist: bionic + - os: linux + arch: amd64 + env: TR_ARCH=fedora-rawhide + dist: xenial # test hangs on bionic + - os: linux + arch: amd64 + env: TR_ARCH=podman-test + dist: bionic + - os: linux + arch: amd64 + env: TR_ARCH=docker-test + dist: bionic + - os: linux + arch: 
amd64 + env: TR_ARCH=docker-test DIST=xenial + # On xenial it should be possible to test overlayfs; + # broken on the latest bionic kernel + dist: xenial + - os: linux + arch: amd64 + env: TR_ARCH=alpine CLANG=1 + dist: xenial # test hangs on bionic + - os: linux + arch: amd64 + env: TR_ARCH=alpine + dist: xenial # test hangs on bionic + - os: linux + arch: amd64 + env: TR_ARCH=centos + dist: xenial # test hangs on bionic + - os: linux + arch: amd64 + env: TR_ARCH=fedora-asan + dist: xenial # test hangs on bionic + - os: linux + arch: amd64 + env: TR_ARCH=armv7-cross + dist: bionic + - os: linux + arch: amd64 + env: TR_ARCH=aarch64-cross + dist: bionic + - os: linux + arch: amd64 + env: TR_ARCH=ppc64-cross + dist: bionic allow_failures: - env: TR_ARCH=docker-test + - env: TR_ARCH=docker-test DIST=xenial - env: TR_ARCH=fedora-rawhide - - env: TR_ARCH=fedora-rawhide-aarch64 - - env: TR_ARCH=s390x - env: TR_ARCH=local GCOV=1 - - env: TR_ARCH=local COMPAT_TEST=y - - env: TR_ARCH=local CLANG=1 COMPAT_TEST=y script: - sudo make CCACHE=1 -C scripts/travis $TR_ARCH after_success: - ccache -s - make -C scripts/travis after_success -group: deprecated-2017Q2 diff --git a/Documentation/HOWTO.cross-compile b/Documentation/HOWTO.cross-compile index f1b17842b8..44b19dfea8 100644 --- a/Documentation/HOWTO.cross-compile +++ b/Documentation/HOWTO.cross-compile @@ -1,4 +1,10 @@ -This HOWTO explains how to cross-compile CRIU on x86 +How to cross-compile CRIU on x86: + +Use the Dockerfile provided: + scripts/build/Dockerfile.armv7-cross + +Historical guide on how to do it without a Docker container: +[Unsupported, may not work anymore!] 1. Download the protobuf sources. 2. 
Apply the patch http://16918.selcdn.ru/crtools/aarch64/0001-protobuf-added-the-support-for-the-acrchitecture-AAr.patch diff --git a/Documentation/Makefile b/Documentation/Makefile index cbc7ff2c81..5025e2b992 100644 --- a/Documentation/Makefile +++ b/Documentation/Makefile @@ -54,7 +54,7 @@ ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.1,%.xml,$@) $< - $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) 2>/dev/null + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.1,%.xml,$@) endif %.8: %.txt $(FOOTER) custom.xsl @@ -63,7 +63,7 @@ ifneq ($(USE_ASCIIDOCTOR),) $(Q) $(ASCIIDOC) -b manpage -d manpage -o $@ $< else $(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $< - $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) 2>/dev/null + $(Q) $(XMLTO) man -m custom.xsl $(patsubst %.8,%.xml,$@) endif %.ps: %.1 diff --git a/Documentation/compel.txt b/Documentation/compel.txt index 744a3b35db..6ccd208615 100644 --- a/Documentation/compel.txt +++ b/Documentation/compel.txt @@ -86,7 +86,7 @@ Infecting code ~~~~~~~~~~~~~~ The parasitic code is compiled and converted to a header using *compel*, and included here. -*#include * +*#include * *#include "parasite.h"* diff --git a/Documentation/criu.txt b/Documentation/criu.txt index 94fc5428a3..ab63e461c7 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -24,8 +24,8 @@ on a different system, or both. OPTIONS ------- -Most of the true / false long options (the ones without arguments) can be -prefixed with *--no-* to negate the option (example: *--display-stats* +Most of the long flags can be +prefixed with *no-* to negate the option (example: *--display-stats* and *--no-display-stats*). Common options @@ -33,12 +33,11 @@ Common options Common options are applicable to any 'command'. *-v*[*v*...], *--verbosity*:: - Increase verbosity up from the default level. 
Multiple *v* can be used, - each increasing verbosity by one level. Using long option without argument - increases verbosity by one level. + Increase verbosity up from the default level. In case of short option, + multiple *v* can be used, each increasing verbosity by one. -*-v*'num', *--verbosity*='num':: - Set verbosity level to 'num'. The higher the level, the more output +**-v**__num__, **--verbosity=**__num__:: + Set verbosity level to _num_. The higher the level, the more output is produced. + The following levels are available: @@ -57,22 +56,22 @@ The following levels are available: Pass a specific configuration file to criu. *--no-default-config*:: - Forbid parsing of default configuration files. + Disable parsing of default configuration files. *--pidfile* 'file':: Write root task, service or page-server pid into a 'file'. *-o*, *--log-file* 'file':: - Write logging messages to 'file'. + Write logging messages to a 'file'. *--display-stats*:: - During dump as well as during restore *criu* collects information - like the time required to dump or restore the process or the + During dump, as well as during restore, *criu* collects some statistics, + like the time required to dump or restore the process, or the number of pages dumped or restored. This information is always - written to the files 'stats-dump' and 'stats-restore' and can - be easily displayed using *crit*. The option *--display-stats* - additionally prints out this information on the console at the end - of a dump or a restore. + saved to the *stats-dump* and *stats-restore* files, and can + be shown using *crit*(1). The option *--display-stats* + prints out this information on the console at the end + of a dump or restore operation. *-D*, *--images-dir* 'path':: Use 'path' as a base directory where to look for sets of image files. @@ -91,6 +90,19 @@ The following levels are available: *-L*, *--libdir* 'path':: Path to plugins directory. 
+*--enable-fs* ['fs'[,'fs'...]]:: + Specify a comma-separated list of filesystem names that should + be auto-detected. The value 'all' enables auto-detection for + all filesystems. ++ +Note: This option is not safe, use at your own risk. +Auto-detecting a filesystem mount assumes that the mountpoint can +be restored with *mount(src, mountpoint, flags, options)*. When used, +*dump* is expected to always succeed if a mountpoint is to be +auto-detected, however *restore* may fail (or do something wrong) +if the assumption for restore logic is incorrect. This option is +not compatible with *--external* *dev*. + *--action-script* 'script':: Add an external action script to be executed at certain stages. The environment variable *CRTOOLS_SCRIPT_ACTION* is available @@ -156,6 +168,12 @@ In addition, *page-server* options may be specified. Turn on memory changes tracker in the kernel. If the option is not passed the memory tracker get turned on implicitly. +*--pre-dump-mode*='mode':: + There are two 'mode' values for the pre-dump algorithm. The 'splice' mode + is parasite based, whereas 'read' mode is based on process_vm_readv + syscall. The 'read' mode incurs reduced frozen time and reduced + memory pressure as compared to 'splice' mode. Default is 'splice' mode. + *dump* ~~~~~~ Performs a checkpoint procedure. @@ -179,7 +197,7 @@ In other words, do not use it unless really needed. *-s*, *--leave-stopped*:: Leave tasks in stopped state after checkpoint, instead of killing. -*--external* 'type'*[*'id'*]:*'value':: +*--external* __type__**[**__id__**]:**__value__:: Dump an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). @@ -188,35 +206,36 @@ In other words, do not use it unless really needed. Note to restore external resources, either *--external* or *--inherit-fd* is used, depending on resource type. 
-*--external mnt[*'mountpoint'*]:*'name':: +*--external* **mnt[**__mountpoint__**]:**__name__:: Dump an external bind mount referenced by 'mountpoint', saving it to image under the identifier 'name'. -*--external mnt[]:*'flags':: +*--external* **mnt[]:**__flags__:: Dump all external bind mounts, autodetecting those. Optional 'flags' can contain *m* to also dump external master mounts, *s* to also dump external shared mounts (default behavior is to abort dumping if such mounts are found). If 'flags' are not provided, colon is optional. -*--external dev[*'major'*/*'minor'*]:*'name':: +*--external* **dev[**__major__**/**__minor__**]:**__name__:: Allow to dump a mount namespace having a real block device mounted. A block device is identified by its 'major' and 'minor' numbers, and *criu* saves its information to image under the identifier 'name'. -*--external file[*'mnt_id'*:*'inode'*]*:: +*--external* **file[**__mnt_id__**:**__inode__**]**:: Dump an external file, i.e. an opened file that is can not be resolved from the current mount namespace, which can not be dumped without using this option. The file is identified by 'mnt_id' (a field obtained from - */proc/*'pid'*/fdinfo/*'N') and 'inode' (as returned by *stat*(2)). + **/proc/**__pid__**/fdinfo/**__N__) and 'inode' (as returned by + *stat*(2)). -*--external tty[*'rdev'*:*'dev'*]*:: +*--external* **tty[**__rdev__**:**__dev__**]**:: Dump an external TTY, identified by *st_rdev* and *st_dev* fields returned by *stat*(2). -*--external unix[*'id'*]*:: +*--external* **unix[**__id__**]**:: Tell *criu* that one end of a pair of UNIX sockets (created by - *socketpair*(2)) with 'id' is OK to be disconnected. + *socketpair*(2)) with the given _id_ is OK to be disconnected. *--freeze-cgroup*:: Use cgroup freezer to collect processes. @@ -266,10 +285,33 @@ For example, the command line for the above example should look like this: discovered automatically (usually via */proc*). 
This option is useful when one needs *criu* to skip some controllers. -*--cgroup-props-ignore-default*:: - When combined with *--cgroup-props*, makes *criu* substitute - a predefined controller property with the new one shipped. If the option - is not used, the predefined properties are merged with the provided ones. +*--cgroup-yard* 'path':: + Instead of trying to mount cgroups in CRIU, provide a path to a directory + with already created cgroup yard. Useful if you don't want to grant + CAP_SYS_ADMIN to CRIU. For every cgroup mount there should be exactly one + directory. If there is only one controller in this mount, the dir's name + should be just the name of the controller. If there are multiple controllers + comounted, the directory name should have them be separated by a comma. ++ +For example, if */proc/cgroups* looks like this: ++ +---------- +#subsys_name hierarchy num_cgroups enabled +cpu 1 1 1 +devices 2 2 1 +freezer 2 2 1 +---------- ++ +then you can create the cgroup yard by the following commands: ++ +---------- +mkdir private_yard +cd private_yard +mkdir cpu +mount -t cgroup -o cpu none cpu +mkdir devices,freezer +mount -t cgroup -o devices,freezer none devices,freezer +---------- *--tcp-established*:: Checkpoint established TCP connections. @@ -351,7 +393,7 @@ By default the option is set to *fpu* and *ins*. ~~~~~~~~~ Restores previously checkpointed processes. -*--inherit-fd* *fd[*'N'*]:*'resource':: +*--inherit-fd* **fd[**__N__**]:**__resource__:: Inherit a file descriptor. This option lets *criu* use an already opened file descriptor 'N' for restoring a file identified by 'resource'. This option can be used to restore an external resource dumped @@ -359,10 +401,10 @@ Restores previously checkpointed processes. 
+ The 'resource' argument can be one of the following: + - - *tty[*'rdev'*:*'dev'*]* - - *pipe[*'inode'*]* - - *socket[*'inode'*]* - - *file[*'mnt_id'*:*'inode'*]* + - **tty[**__rdev__**:**__dev__**]** + - **pipe[**__inode__**]** + - **socket[**__inode__*]* + - **file[**__mnt_id__**:**__inode__**]** - 'path/to/file' + @@ -385,8 +427,10 @@ usually need to be escaped from shell. *-r*, *--root* 'path':: Change the root filesystem to 'path' (when run in a mount namespace). + This option is required to restore a mount namespace. The directory + 'path' must be a mount point and its parent must not be overmounted. -*--external* 'type'*[*'id'*]:*'value':: +*--external* __type__**[**__id__**]:**__value__:: Restore an instance of an external resource. The generic syntax is 'type' of resource, followed by resource 'id' (enclosed in literal square brackets), and optional 'value' (prepended by a literal colon). @@ -396,7 +440,7 @@ usually need to be escaped from shell. the help of *--external* *file*, *tty*, and *unix* options), option *--inherit-fd* should be used. -*--external mnt[*'name'*]:*'mountpoint':: +*--external* **mnt[**__name__**]:**__mountpoint__:: Restore an external bind mount referenced in the image by 'name', bind-mounting it from the host 'mountpoint' to a proper mount point. @@ -404,17 +448,17 @@ usually need to be escaped from shell. Restore all external bind mounts (dumped with the help of *--external mnt[]* auto-detection). -*--external dev[*'name'*]:*'/dev/path':: +*--external* **dev[**__name__**]:**__/dev/path__:: Restore an external mount device, identified in the image by 'name', using the existing block device '/dev/path'. -*--external veth[*'inner_dev'*]:*'outer_dev'*@*'bridge':: +*--external* **veth[**__inner_dev__**]:**__outer_dev__**@**__bridge__:: Set the outer VETH device name (corresponding to 'inner_dev' being - restored) to 'outer_dev'. If optional *@*'bridge' is specified, + restored) to 'outer_dev'. 
If optional **@**_bridge_ is specified, 'outer_dev' is added to that bridge. If the option is not used, 'outer_dev' will be autogenerated by the kernel. -*--external macvlan[*'inner_dev'*]:*'outer_dev':: +*--external* **macvlan[**__inner_dev__**]:**__outer_dev__:: When restoring an image that have a MacVLAN device in it, this option must be used to specify to which 'outer_dev' (an existing network device in CRIU namespace) the restored 'inner_dev' should be bound to. @@ -433,7 +477,7 @@ The 'mode' may be one of the following: *soft*::: Restore cgroup properties if only cgroup has been created by *criu*, otherwise do not restore properties. This is the - default if mode is unspecified. + default if mode is unspecified. *full*::: Always restore all cgroups and their properties. @@ -442,6 +486,11 @@ The 'mode' may be one of the following: *ignore*::: Don't deal with cgroups and pretend that they don't exist. +*--cgroup-yard* 'path':: + Instead of trying to mount cgroups in CRIU, provide a path to a directory + with already created cgroup yard. For more information look in the *dump* + section. + *--cgroup-root* ['controller'*:*]/'newroot':: Change the root cgroup the controller will be installed into. No controller means that root is the default for all controllers not specified. @@ -454,14 +503,14 @@ The 'mode' may be one of the following: *--tcp-close*:: Restore connected TCP sockets in closed state. -*--veth-pair* 'IN'*=*'OUT':: +*--veth-pair* __IN__**=**__OUT__:: Correspondence between outside and inside names of veth devices. *-l*, *--file-locks*:: Restore file locks from the image. -*--lsm-profile* 'type'*:*'name':: - Specify an LSM profile to be used during restore. The `type` can be +*--lsm-profile* __type__**:**__name__:: + Specify an LSM profile to be used during restore. The _type_ can be either *apparmor* or *selinux*. 
*--auto-dedup*:: @@ -526,17 +575,17 @@ check* always checks Category 1 features unless *--feature* is specified which only checks a specified feature. *Category 1*::: Absolutely required. These are features like support for - */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket - monitoring, */proc/sys/kernel/ns_last_pid* etc. + */proc/PID/map_files*, *NETLINK_SOCK_DIAG* socket + monitoring, */proc/sys/kernel/ns_last_pid* etc. *Category 2*::: Required only for specific cases. These are features - like AIO remap, */dev/net/tun* and others that are only - required if a process being dumped or restored - is using those. + like AIO remap, */dev/net/tun* and others that are only + required if a process being dumped or restored + is using those. *Category 3*::: Experimental. These are features like *task-diag* that - are used for experimental purposes (mostly - during development). + are used for experimental purposes (mostly + during development). If there are no errors or warnings, *criu* prints "Looks good." and its exit code is 0. diff --git a/Makefile b/Makefile index 0140330e14..00e563c113 100644 --- a/Makefile +++ b/Makefile @@ -17,8 +17,6 @@ ifeq ($(origin HOSTCFLAGS), undefined) HOSTCFLAGS := $(CFLAGS) $(USERCFLAGS) endif -UNAME-M := $(shell uname -m) - # # Supported Architectures ifneq ($(filter-out x86 arm aarch64 ppc64 s390,$(ARCH)),) @@ -27,15 +25,14 @@ endif # The PowerPC 64 bits architecture could be big or little endian. # They are handled in the same way. -ifeq ($(UNAME-M),ppc64) +ifeq ($(SUBARCH),ppc64) error := $(error ppc64 big endian is not yet supported) endif # # Architecture specific options. 
ifeq ($(ARCH),arm) - ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') - DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 + ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') ifeq ($(ARMV),6) USERCFLAGS += -march=armv6 @@ -45,6 +42,16 @@ ifeq ($(ARCH),arm) USERCFLAGS += -march=armv7-a endif + ifeq ($(ARMV),8) + # Running 'setarch linux32 uname -m' returns armv8l on travis aarch64. + # This tells CRIU to handle armv8l just as armv7hf. Right now this is + # only used for compile testing. No further verification of armv8l exists. + USERCFLAGS += -march=armv7-a + ARMV := 7 + endif + + DEFINES := -DCONFIG_ARMV$(ARMV) -DCONFIG_VDSO_32 + PROTOUFIX := y # For simplicity - compile code in Arm mode without interwork. # We could choose Thumb mode as default instead - but a dirty @@ -77,7 +84,6 @@ endif # commit "S/390: Fix 64 bit sibcall". ifeq ($(ARCH),s390) ARCH := s390 - SRCARCH := s390 DEFINES := -DCONFIG_S390 CFLAGS_PIE := -fno-optimize-sibling-calls endif @@ -85,7 +91,7 @@ endif CFLAGS_PIE += -DCR_NOGLIBC export CFLAGS_PIE -LDARCH ?= $(SRCARCH) +LDARCH ?= $(ARCH) export LDARCH export PROTOUFIX DEFINES @@ -94,7 +100,7 @@ export PROTOUFIX DEFINES DEFINES += -D_FILE_OFFSET_BITS=64 DEFINES += -D_GNU_SOURCE -WARNINGS := -Wall -Wformat-security +WARNINGS := -Wall -Wformat-security -Wdeclaration-after-statement -Wstrict-prototypes CFLAGS-GCOV := --coverage -fno-exceptions -fno-inline -fprofile-update=atomic export CFLAGS-GCOV diff --git a/Makefile.config b/Makefile.config index 1e4352b9de..98ba5d892b 100644 --- a/Makefile.config +++ b/Makefile.config @@ -23,6 +23,23 @@ else $(info Note: Building without GnuTLS support) endif +ifeq ($(call pkg-config-check,libnftables),y) + LIB_NFTABLES := $(shell pkg-config --libs libnftables) + ifeq ($(call try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_0),$(LIB_NFTABLES)),true) + LIBS_FEATURES += $(LIB_NFTABLES) + FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_0 + else ifeq ($(call 
try-cc,$(FEATURE_TEST_NFTABLES_LIB_API_1),$(LIB_NFTABLES)),true) + LIBS_FEATURES += $(LIB_NFTABLES) + FEATURE_DEFINES += -DCONFIG_HAS_NFTABLES_LIB_API_1 + else + $(warning Warn: you have libnftables installed but it has incompatible API) + $(warning Warn: Building without nftables support) + endif +else + $(warning Warn: you have no libnftables installed) + $(warning Warn: Building without nftables support) +endif + export LIBS += $(LIBS_FEATURES) CONFIG_FILE = .config @@ -30,7 +47,7 @@ CONFIG_FILE = .config $(CONFIG_FILE): touch $(CONFIG_FILE) -ifeq ($(SRCARCH),x86) +ifeq ($(ARCH),x86) # CONFIG_COMPAT is only for x86 now, no need for compile-test other archs ifeq ($(call try-asm,$(FEATURE_TEST_X86_COMPAT)),true) export CONFIG_COMPAT := y @@ -47,7 +64,7 @@ export DEFINES += $(FEATURE_DEFINES) export CFLAGS += $(FEATURE_DEFINES) FEATURES_LIST := TCP_REPAIR STRLCPY STRLCAT PTRACE_PEEKSIGINFO \ - SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW + SETPROCTITLE_INIT MEMFD TCP_REPAIR_WINDOW FSCONFIG MEMFD_CREATE # $1 - config name define gen-feature-test diff --git a/README.md b/README.md index 16e8452b55..6a578b9530 100644 --- a/README.md +++ b/README.md @@ -63,11 +63,11 @@ Linux kernel supporting checkpoint and restore for all the features it provides. looking for contributors of all kinds -- feedback, bug reports, testing, coding, writing, etc. Here are some useful hints to get involved. 
-* We have both -- [very simple](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; -* CRIU does need [extensive testing](https://checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); +* We have both -- [very simple](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Aenhancement) and [more sophisticated](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3A%22new+feature%22) coding tasks; +* CRIU does need [extensive testing](https://github.com/checkpoint-restore/criu/issues?q=is%3Aissue+is%3Aopen+label%3Atesting); * Documentation is always hard, we have [some information](https://criu.org/Category:Empty_articles) that is to be extracted from people's heads into wiki pages as well as [some texts](https://criu.org/Category:Editor_help_needed) that all need to be converted into useful articles; * Feedback is expected on the github issues page and on the [mailing list](https://lists.openvz.org/mailman/listinfo/criu); -* For historical reasons we do not accept PRs, instead [patches are welcome](http://criu.org/How_to_submit_patches); +* We accept github pull requests and this is the preferred way to contribute to CRIU. 
If you prefer to send patches by email, you are welcome to send them to [the devel list](http://criu.org/How_to_submit_patches); * Spread the word about CRIU in [social networks](http://criu.org/Contacts); * If you're giving a talk about CRIU -- let us know, we'll mention it on the [wiki main page](https://criu.org/News/events); diff --git a/compel/arch/aarch64/src/lib/handle-elf.c b/compel/arch/aarch64/src/lib/handle-elf.c index 1c3686c484..1ee65ee2ca 100644 --- a/compel/arch/aarch64/src/lib/handle-elf.c +++ b/compel/arch/aarch64/src/lib/handle-elf.c @@ -1,6 +1,5 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" diff --git a/compel/arch/aarch64/src/lib/include/syscall.h b/compel/arch/aarch64/src/lib/include/syscall.h index e2ec1272ec..30290667af 100644 --- a/compel/arch/aarch64/src/lib/include/syscall.h +++ b/compel/arch/aarch64/src/lib/include/syscall.h @@ -1,4 +1,4 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif diff --git a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h index 4662f76897..7a33baa8ef 100644 --- a/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/aarch64/src/lib/include/uapi/asm/infect-types.h @@ -27,6 +27,6 @@ typedef struct user_fpsimd_state user_fpregs_struct_t; #define ARCH_SI_TRAP TRAP_BRKPT -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def index 721ff16dc0..f7ebc85278 100644 --- a/compel/arch/arm/plugins/std/syscalls/syscall.def +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def @@ -112,3 +112,7 @@ userfaultfd 282 388 (int flags) fallocate 47 352 (int 
fd, int mode, loff_t offset, loff_t len) cacheflush ! 983042 (void *start, void *end, int flags) ppoll 73 336 (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +fsopen 430 430 (char *fsname, unsigned int flags) +fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux) +fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags) +clone3 435 435 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/arm/src/lib/handle-elf.c b/compel/arch/arm/src/lib/handle-elf.c index 8abf8dad1d..5b8d00a6f7 100644 --- a/compel/arch/arm/src/lib/handle-elf.c +++ b/compel/arch/arm/src/lib/handle-elf.c @@ -1,6 +1,5 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" diff --git a/compel/arch/arm/src/lib/include/syscall.h b/compel/arch/arm/src/lib/include/syscall.h index e2ec1272ec..30290667af 100644 --- a/compel/arch/arm/src/lib/include/syscall.h +++ b/compel/arch/arm/src/lib/include/syscall.h @@ -1,4 +1,4 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif diff --git a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h index b8286d4049..69222b251f 100644 --- a/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/arm/src/lib/include/uapi/asm/infect-types.h @@ -61,6 +61,6 @@ struct user_vfp_exc { #define ARCH_SI_TRAP TRAP_BRKPT -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/arm/src/lib/infect.c b/compel/arch/arm/src/lib/infect.c index c17cb9c9b6..0053bef581 100644 --- a/compel/arch/arm/src/lib/infect.c +++ b/compel/arch/arm/src/lib/infect.c @@ -1,6 +1,7 @@ #include #include #include +#include 
#include #include #include "common/page.h" diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl index 3b30790402..1afaf1e704 100644 --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl @@ -108,3 +108,7 @@ __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_preadv 320 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_userfaultfd 364 sys_userfaultfd (int flags) __NR_ppoll 281 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/ppc64/src/lib/cpu.c b/compel/arch/ppc64/src/lib/cpu.c index 338ab4891f..7a39727908 100644 --- a/compel/arch/ppc64/src/lib/cpu.c +++ b/compel/arch/ppc64/src/lib/cpu.c @@ -2,6 +2,7 @@ #include #include #include +#include #include "compel-cpu.h" diff --git a/compel/arch/ppc64/src/lib/handle-elf.c b/compel/arch/ppc64/src/lib/handle-elf.c index 3d4020f597..f29fdc8a39 100644 --- a/compel/arch/ppc64/src/lib/handle-elf.c +++ b/compel/arch/ppc64/src/lib/handle-elf.c @@ -1,6 +1,5 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" diff --git a/compel/arch/ppc64/src/lib/include/syscall.h b/compel/arch/ppc64/src/lib/include/syscall.h index e2ec1272ec..30290667af 100644 --- a/compel/arch/ppc64/src/lib/include/syscall.h +++ b/compel/arch/ppc64/src/lib/include/syscall.h @@ -1,4 +1,4 @@ #ifndef __COMPEL_SYSCALL_H__ #define __COMPEL_SYSCALL_H__ -#define __NR(syscall, compat) 
__NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h index 89fc4aa3c4..126fa2ea31 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/infect-types.h @@ -81,6 +81,6 @@ typedef struct { #define ARCH_SI_TRAP TRAP_BRKPT -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) #endif /* UAPI_COMPEL_ASM_TYPES_H__ */ diff --git a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h index 9467a1b990..5c98b199de 100644 --- a/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/ppc64/src/lib/include/uapi/asm/sigframe.h @@ -50,7 +50,7 @@ struct rt_sigframe { "sc \n" \ : \ : "r"(new_sp) \ - : "1", "memory") + : "memory") #if _CALL_ELF != 2 # error Only supporting ABIv2. 
diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl index cc13a63dd5..ae6fdb5f8d 100644 --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl @@ -108,3 +108,7 @@ __NR_userfaultfd 355 sys_userfaultfd (int flags) __NR_preadv 328 sys_preadv_raw (int fd, struct iovec *iov, unsigned long nr, unsigned long pos_l, unsigned long pos_h) __NR_gettimeofday 78 sys_gettimeofday (struct timeval *tv, struct timezone *tz) __NR_ppoll 302 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/s390/src/lib/handle-elf.c b/compel/arch/s390/src/lib/handle-elf.c index 01a8bf4c8b..6ed382c92f 100644 --- a/compel/arch/s390/src/lib/handle-elf.c +++ b/compel/arch/s390/src/lib/handle-elf.c @@ -1,6 +1,5 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" diff --git a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h index fddf65d3b0..8171d33951 100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/infect-types.h @@ -73,7 +73,7 @@ typedef struct { #define user_regs_native(pregs) true -#define __NR(syscall, compat) __NR_##syscall +#define __NR(syscall, compat) ({ (void)compat; __NR_##syscall; }) struct mmap_arg_struct { unsigned long addr; diff --git a/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h b/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h index b6b8944733..c599ef3ab2 
100644 --- a/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/s390/src/lib/include/uapi/asm/sigframe.h @@ -66,7 +66,7 @@ struct rt_sigframe { "svc 0\n" \ : \ : "d" (new_sp) \ - : "15", "memory") + : "memory") #define RT_SIGFRAME_UC(rt_sigframe) (&rt_sigframe->uc) #define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.regs.psw.addr diff --git a/compel/arch/s390/src/lib/infect.c b/compel/arch/s390/src/lib/infect.c index 00e9c36d21..5a4675449d 100644 --- a/compel/arch/s390/src/lib/infect.c +++ b/compel/arch/s390/src/lib/infect.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -453,8 +454,10 @@ void *remote_mmap(struct parasite_ctl *ctl, if (ptrace_poke_area(pid, &arg_struct, where, sizeof(arg_struct))) { pr_err("Can't restore mmap args (pid: %d)\n", pid); if (map != 0) { - compel_syscall(ctl, __NR_munmap, NULL, map, + err = compel_syscall(ctl, __NR_munmap, NULL, map, length, 0, 0, 0, 0); + if (err) + pr_err("Can't munmap %d\n", err); map = 0; } } diff --git a/compel/arch/x86/plugins/std/parasite-head.S b/compel/arch/x86/plugins/std/parasite-head.S index a988de9d42..465cd887b1 100644 --- a/compel/arch/x86/plugins/std/parasite-head.S +++ b/compel/arch/x86/plugins/std/parasite-head.S @@ -25,7 +25,9 @@ ENTRY(__export_parasite_head_start_compat) .code64 PARASITE_ENTRY 0 pushq $__USER32_CS - pushq $2f + xor %r11, %r11 + movl $2f, %r11d + pushq %r11 lretq 2: .code32 diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl index 7903ab150a..7a487110d9 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl @@ -96,3 +96,7 @@ __NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char __NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 374 sys_userfaultfd (int flags) __NR_ppoll 309 sys_ppoll (struct pollfd *fds, unsigned 
int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl index 4ac9164ea1..6667c07db7 100644 --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl @@ -107,3 +107,7 @@ __NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1 __NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags) __NR_userfaultfd 323 sys_userfaultfd (int flags) __NR_ppoll 271 sys_ppoll (struct pollfd *fds, unsigned int nfds, const struct timespec *tmo, const sigset_t *sigmask, size_t sigsetsize) +__NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags) +__NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux) +__NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags) +__NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size) diff --git a/compel/arch/x86/src/lib/handle-elf.c b/compel/arch/x86/src/lib/handle-elf.c index 62fb28f494..938999b2e1 100644 --- a/compel/arch/x86/src/lib/handle-elf.c +++ b/compel/arch/x86/src/lib/handle-elf.c @@ -1,6 +1,5 @@ #include - -#include "uapi/compel.h" +#include #include "handle-elf.h" #include "piegen.h" diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h index 509f4488b3..4ff531fb9c 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h @@ -263,7 +263,7 @@ struct xsave_struct_ia32 { struct ymmh_struct ymmh; 
uint8_t extended_state_area[EXTENDED_STATE_AREA_SIZE]; }; -} __aligned(FXSAVE_ALIGN_BYTES); +}; typedef struct { /* @@ -309,7 +309,11 @@ typedef struct { typedef struct { union { fpu_state_64_t fpu_state_64; - fpu_state_ia32_t fpu_state_ia32; + struct { + /* fpu_state_ia32->xsave has to be 64-byte aligned. */ + uint32_t __pad[2]; + fpu_state_ia32_t fpu_state_ia32; + }; }; uint8_t has_fpu; diff --git a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h index 51ca023f77..486c0c8e03 100644 --- a/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h +++ b/compel/arch/x86/src/lib/include/uapi/asm/sigframe.h @@ -194,7 +194,9 @@ void rt_sigframe_erase_sigset(struct rt_sigframe *sigframe) #define ARCH_RT_SIGRETURN_COMPAT(new_sp) \ asm volatile( \ "pushq $"__stringify(USER32_CS)" \n" \ - "pushq $1f \n" \ + "xor %%rax, %%rax \n" \ + "movl $1f, %%eax \n" \ + "pushq %%rax \n" \ "lretq \n" \ "1: \n" \ ".code32 \n" \ diff --git a/compel/arch/x86/src/lib/infect.c b/compel/arch/x86/src/lib/infect.c index 11e7f4c91f..9c4abb60c2 100644 --- a/compel/arch/x86/src/lib/infect.c +++ b/compel/arch/x86/src/lib/infect.c @@ -3,6 +3,7 @@ #include #include #include +#include #include diff --git a/compel/include/log.h b/compel/include/log.h index 559f909cea..49e65bb507 100644 --- a/compel/include/log.h +++ b/compel/include/log.h @@ -1,8 +1,7 @@ #ifndef COMPEL_LOG_H__ #define COMPEL_LOG_H__ -#include "uapi/compel/compel.h" -#include "uapi/compel/loglevels.h" +#include "uapi/compel/log.h" #ifndef LOG_PREFIX # define LOG_PREFIX diff --git a/compel/include/uapi/compel.h b/compel/include/uapi/compel.h deleted file mode 100644 index 318a472da9..0000000000 --- a/compel/include/uapi/compel.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef UAPI_COMPEL_H__ -#define UAPI_COMPEL_H__ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#endif /* UAPI_COMPEL_H__ */ diff --git a/compel/include/uapi/cpu.h b/compel/include/uapi/cpu.h 
index 6f827d4472..72c8a516c2 100644 --- a/compel/include/uapi/cpu.h +++ b/compel/include/uapi/cpu.h @@ -6,7 +6,7 @@ #include -extern int compel_cpuid(compel_cpuinfo_t *info); +extern int /* TODO: __must_check */ compel_cpuid(compel_cpuinfo_t *info); extern bool compel_cpu_has_feature(unsigned int feature); extern bool compel_fpu_has_feature(unsigned int feature); extern uint32_t compel_fpu_feature_size(unsigned int feature); diff --git a/compel/include/uapi/infect-rpc.h b/compel/include/uapi/infect-rpc.h index 0176c11425..180dedf1f6 100644 --- a/compel/include/uapi/infect-rpc.h +++ b/compel/include/uapi/infect-rpc.h @@ -6,9 +6,9 @@ #include struct parasite_ctl; -extern int compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); -extern int compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); -extern int compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_sync(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_call(unsigned int cmd, struct parasite_ctl *ctl); +extern int __must_check compel_rpc_call_sync(unsigned int cmd, struct parasite_ctl *ctl); extern int compel_rpc_sock(struct parasite_ctl *ctl); #define PARASITE_USER_CMDS 64 diff --git a/compel/include/uapi/infect-util.h b/compel/include/uapi/infect-util.h index 7307ba57a0..4e32d13dc4 100644 --- a/compel/include/uapi/infect-util.h +++ b/compel/include/uapi/infect-util.h @@ -1,6 +1,9 @@ #ifndef __COMPEL_INFECT_UTIL_H__ #define __COMPEL_INFECT_UTIL_H__ + +#include "common/compiler.h" + struct parasite_ctl; -extern int compel_util_send_fd(struct parasite_ctl *ctl, int fd); +extern int __must_check compel_util_send_fd(struct parasite_ctl *ctl, int fd); extern int compel_util_recv_fd(struct parasite_ctl *ctl, int *pfd); #endif diff --git a/compel/include/uapi/infect.h b/compel/include/uapi/infect.h index 08beaffcdf..dd672bc1c9 100644 --- a/compel/include/uapi/infect.h +++ b/compel/include/uapi/infect.h @@ -13,7 +13,7 @@ 
#define PARASITE_START_AREA_MIN (4096) -extern int compel_interrupt_task(int pid); +extern int __must_check compel_interrupt_task(int pid); struct seize_task_status { unsigned long long sigpnd; @@ -23,27 +23,28 @@ struct seize_task_status { int seccomp_mode; }; -extern int compel_wait_task(int pid, int ppid, +extern int __must_check compel_wait_task(int pid, int ppid, int (*get_status)(int pid, struct seize_task_status *, void *data), void (*free_status)(int pid, struct seize_task_status *, void *data), struct seize_task_status *st, void *data); -extern int compel_stop_task(int pid); +extern int __must_check compel_stop_task(int pid); extern int compel_resume_task(pid_t pid, int orig_state, int state); struct parasite_ctl; struct parasite_thread_ctl; -extern struct parasite_ctl *compel_prepare(int pid); -extern struct parasite_ctl *compel_prepare_noctx(int pid); -extern int compel_infect(struct parasite_ctl *ctl, unsigned long nr_threads, unsigned long args_size); -extern struct parasite_thread_ctl *compel_prepare_thread(struct parasite_ctl *ctl, int pid); +extern struct parasite_ctl __must_check *compel_prepare(int pid); +extern struct parasite_ctl __must_check *compel_prepare_noctx(int pid); +extern int __must_check compel_infect(struct parasite_ctl *ctl, + unsigned long nr_threads, unsigned long args_size); +extern struct parasite_thread_ctl __must_check *compel_prepare_thread(struct parasite_ctl *ctl, int pid); extern void compel_release_thread(struct parasite_thread_ctl *); -extern int compel_stop_daemon(struct parasite_ctl *ctl); -extern int compel_cure_remote(struct parasite_ctl *ctl); -extern int compel_cure_local(struct parasite_ctl *ctl); -extern int compel_cure(struct parasite_ctl *ctl); +extern int __must_check compel_stop_daemon(struct parasite_ctl *ctl); +extern int __must_check compel_cure_remote(struct parasite_ctl *ctl); +extern int __must_check compel_cure_local(struct parasite_ctl *ctl); +extern int __must_check compel_cure(struct parasite_ctl 
*ctl); #define PARASITE_ARG_SIZE_MIN ( 1 << 12) @@ -58,15 +59,16 @@ extern int compel_cure(struct parasite_ctl *ctl); extern void *compel_parasite_args_p(struct parasite_ctl *ctl); extern void *compel_parasite_args_s(struct parasite_ctl *ctl, unsigned long args_size); -extern int compel_syscall(struct parasite_ctl *ctl, int nr, long *ret, +extern int __must_check compel_syscall(struct parasite_ctl *ctl, + int nr, long *ret, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6); -extern int compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); -extern int compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); +extern int __must_check compel_run_in_thread(struct parasite_thread_ctl *tctl, unsigned int cmd); +extern int __must_check compel_run_at(struct parasite_ctl *ctl, unsigned long ip, user_regs_struct_t *ret_regs); /* * The PTRACE_SYSCALL will trap task twice -- on @@ -80,12 +82,13 @@ enum trace_flags { TRACE_EXIT, }; -extern int compel_stop_on_syscall(int tasks, int sys_nr, +extern int __must_check compel_stop_on_syscall(int tasks, int sys_nr, int sys_nr_compat, enum trace_flags trace); -extern int compel_stop_pie(pid_t pid, void *addr, enum trace_flags *tf, bool no_bp); +extern int __must_check compel_stop_pie(pid_t pid, void *addr, + enum trace_flags *tf, bool no_bp); -extern int compel_unmap(struct parasite_ctl *ctl, unsigned long addr); +extern int __must_check compel_unmap(struct parasite_ctl *ctl, unsigned long addr); extern int compel_mode_native(struct parasite_ctl *ctl); @@ -159,7 +162,7 @@ struct parasite_blob_desc { extern struct parasite_blob_desc *compel_parasite_blob_desc(struct parasite_ctl *); -extern int compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); +extern int __must_check compel_get_thread_regs(struct parasite_thread_ctl *, save_regs_t, void *); extern void compel_relocs_apply(void *mem, void 
*vbase, size_t size, compel_reloc_t *elf_relocs, size_t nr_relocs); diff --git a/compel/include/uapi/ptrace.h b/compel/include/uapi/ptrace.h index 4df00b6e1b..13eed72328 100644 --- a/compel/include/uapi/ptrace.h +++ b/compel/include/uapi/ptrace.h @@ -1,6 +1,7 @@ #ifndef UAPI_COMPEL_PTRACE_H__ #define UAPI_COMPEL_PTRACE_H__ +#include "common/compiler.h" /* * We'd want to include both sys/ptrace.h and linux/ptrace.h, * hoping that most definitions come from either one or another. @@ -75,8 +76,8 @@ typedef struct { extern int ptrace_suspend_seccomp(pid_t pid); -extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); -extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); -extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); +extern int __must_check ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes); +extern int __must_check ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes); +extern int __must_check ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes); #endif /* UAPI_COMPEL_PTRACE_H__ */ diff --git a/compel/include/uapi/sigframe-common.h b/compel/include/uapi/sigframe-common.h index fc93c5480b..177bf4c48a 100644 --- a/compel/include/uapi/sigframe-common.h +++ b/compel/include/uapi/sigframe-common.h @@ -8,6 +8,7 @@ # error "Direct inclusion is forbidden, use instead" #endif +#include "common/compiler.h" #include #include @@ -56,7 +57,7 @@ struct rt_ucontext { unsigned long uc_regspace[128] __attribute__((aligned(8))); }; -extern int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, - struct rt_sigframe *rframe); +extern int __must_check sigreturn_prep_fpu_frame(struct rt_sigframe *frame, + struct rt_sigframe *rframe); #endif /* UAPI_COMPEL_SIGFRAME_COMMON_H__ */ diff --git a/compel/plugins/Makefile b/compel/plugins/Makefile index a326e2a661..197ff1b24e 100644 --- a/compel/plugins/Makefile +++ b/compel/plugins/Makefile @@ -53,11 +53,11 @@ std-lib-y += 
./$(PLUGIN_ARCH_DIR)/std/parasite-head.o target += fds fds-lib-y += fds/fds.o -ifeq ($(SRCARCH),x86) +ifeq ($(ARCH),x86) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o endif -ifeq ($(SRCARCH),ppc64) +ifeq ($(ARCH),ppc64) std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcpy.o std-lib-y += ./$(PLUGIN_ARCH_DIR)/std/memcmp.o endif diff --git a/compel/plugins/include/uapi/plugin-fds.h b/compel/plugins/include/uapi/plugin-fds.h index cececb21d9..e995b4b66a 100644 --- a/compel/plugins/include/uapi/plugin-fds.h +++ b/compel/plugins/include/uapi/plugin-fds.h @@ -1,7 +1,7 @@ #ifndef COMPEL_PLUGIN_STD_STD_H__ #define COMPEL_PLUGIN_STD_STD_H__ -extern int fds_send_fd(int fd); +extern int __must_check fds_send_fd(int fd); extern int fds_recv_fd(void); #endif /* COMPEL_PLUGIN_STD_STD_H__ */ diff --git a/compel/plugins/include/uapi/std/infect.h b/compel/plugins/include/uapi/std/infect.h index 800df25095..1e784f8b43 100644 --- a/compel/plugins/include/uapi/std/infect.h +++ b/compel/plugins/include/uapi/std/infect.h @@ -1,14 +1,16 @@ #ifndef COMPEL_PLUGIN_STD_INFECT_H__ #define COMPEL_PLUGIN_STD_INFECT_H__ +#include "common/compiler.h" + extern int parasite_get_rpc_sock(void); -extern int parasite_service(unsigned int cmd, void *args); +extern int __must_check parasite_service(unsigned int cmd, void *args); /* * Must be supplied by user plugins. 
*/ -extern int parasite_daemon_cmd(int cmd, void *args); -extern int parasite_trap_cmd(int cmd, void *args); +extern int __must_check parasite_daemon_cmd(int cmd, void *args); +extern int __must_check parasite_trap_cmd(int cmd, void *args); extern void parasite_cleanup(void); /* diff --git a/compel/plugins/include/uapi/std/log.h b/compel/plugins/include/uapi/std/log.h index f21b6df0d9..91462c85b7 100644 --- a/compel/plugins/include/uapi/std/log.h +++ b/compel/plugins/include/uapi/std/log.h @@ -2,6 +2,7 @@ #define COMPEL_PLUGIN_STD_LOG_H__ #include "compel/loglevels.h" +#include "common/compiler.h" #define STD_LOG_SIMPLE_CHUNK 256 diff --git a/compel/plugins/include/uapi/std/syscall-types.h b/compel/plugins/include/uapi/std/syscall-types.h index 57865e7413..031e773bb6 100644 --- a/compel/plugins/include/uapi/std/syscall-types.h +++ b/compel/plugins/include/uapi/std/syscall-types.h @@ -39,6 +39,7 @@ struct msghdr; struct rusage; struct iocb; struct pollfd; +struct clone_args; typedef unsigned long aio_context_t; diff --git a/compel/src/lib/handle-elf.c b/compel/src/lib/handle-elf.c index ca7c53b711..69d5104b66 100644 --- a/compel/src/lib/handle-elf.c +++ b/compel/src/lib/handle-elf.c @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -12,8 +12,6 @@ #include #include -#include "uapi/compel.h" - #include "handle-elf.h" #include "piegen.h" #include "log.h" @@ -228,7 +226,7 @@ int __handle_elf(void *mem, size_t size) } pr_out("/* Autogenerated from %s */\n", opts.input_filename); - pr_out("#include \n"); + pr_out("#include \n"); for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { Elf_Sym *sym = &symbols[i]; diff --git a/compel/src/lib/infect.c b/compel/src/lib/infect.c index f0bcaf334f..3fad85ed3c 100644 --- a/compel/src/lib/infect.c +++ b/compel/src/lib/infect.c @@ -313,6 +313,8 @@ int compel_wait_task(int pid, int ppid, int compel_resume_task(pid_t pid, int orig_st, int st) { + int ret = 0; + pr_debug("\tUnseizing %d into %d\n", 
pid, st); if (st == COMPEL_TASK_DEAD) { @@ -335,15 +337,17 @@ int compel_resume_task(pid_t pid, int orig_st, int st) */ if (orig_st == COMPEL_TASK_STOPPED) kill(pid, SIGSTOP); - } else + } else { pr_err("Unknown final state %d\n", st); + ret = -1; + } if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) { pr_perror("Unable to detach from %d", pid); return -1; } - return 0; + return ret; } static int gen_parasite_saddr(struct sockaddr_un *saddr, int key) @@ -718,14 +722,25 @@ static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size) return 0; } +static void parasite_memfd_close(struct parasite_ctl *ctl, int fd) +{ + bool compat = !compel_mode_native(ctl); + long ret; + int err; + + err = compel_syscall(ctl, __NR(close, compat), &ret, fd, 0, 0, 0, 0, 0); + if (err || ret) + pr_err("Can't close memfd\n"); +} + static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) { void *where = (void *)ctl->ictx.syscall_ip + BUILTIN_SYSCALL_SIZE; + bool compat_task = !compel_mode_native(ctl); uint8_t orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME; pid_t pid = ctl->rpid; long sret = -ENOSYS; int ret, fd, lfd; - bool __maybe_unused compat_task = !compel_mode_native(ctl); if (ctl->ictx.flags & INFECT_NO_MEMFD) return 1; @@ -741,10 +756,9 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) (unsigned long)where, 0, 0, 0, 0, 0); if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) { - fd = (int)(long)sret; + fd = (int)sret; if (fd >= 0) - compel_syscall(ctl, __NR(close, compat_task), &sret, - fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); pr_err("Can't restore memfd args (pid: %d)\n", pid); return -1; } @@ -752,7 +766,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) if (ret < 0) return ret; - fd = (int)(long)sret; + fd = (int)sret; if (fd == -ENOSYS) return 1; if (fd < 0) { @@ -787,7 +801,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) 
goto err_curef; } - compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); close(lfd); pr_info("Set up parasite blob using memfd\n"); @@ -796,7 +810,7 @@ static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size) err_curef: close(lfd); err_cure: - compel_syscall(ctl, __NR(close, compat_task), &sret, fd, 0, 0, 0, 0, 0); + parasite_memfd_close(ctl, fd); return -1; } @@ -1293,6 +1307,7 @@ int compel_stop_daemon(struct parasite_ctl *ctl) int compel_cure_remote(struct parasite_ctl *ctl) { long ret; + int err; if (compel_stop_daemon(ctl)) return -1; @@ -1300,9 +1315,12 @@ int compel_cure_remote(struct parasite_ctl *ctl) if (!ctl->remote_map) return 0; - compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, - (unsigned long)ctl->remote_map, ctl->map_length, - 0, 0, 0, 0); + err = compel_syscall(ctl, __NR(munmap, !compel_mode_native(ctl)), &ret, + (unsigned long)ctl->remote_map, ctl->map_length, + 0, 0, 0, 0); + if (err) + return err; + if (ret) { pr_err("munmap for remote map %p, %lu returned %lu\n", ctl->remote_map, ctl->map_length, ret); diff --git a/compel/src/lib/log.c b/compel/src/lib/log.c index d195343e45..c86be02c5f 100644 --- a/compel/src/lib/log.c +++ b/compel/src/lib/log.c @@ -4,11 +4,8 @@ #include #include #include - #include -#include - #include "log.h" static unsigned int current_loglevel = COMPEL_DEFAULT_LOGLEVEL; diff --git a/compel/src/lib/ptrace.c b/compel/src/lib/ptrace.c index 9142bac421..4c3530c853 100644 --- a/compel/src/lib/ptrace.c +++ b/compel/src/lib/ptrace.c @@ -34,46 +34,74 @@ int ptrace_suspend_seccomp(pid_t pid) int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes) { unsigned long w; - if (bytes & (sizeof(long) - 1)) + int old_errno = errno; + + if (bytes & (sizeof(long) - 1)) { + pr_err("Peek request with non-word size %ld\n", bytes); return -1; + } + + errno = 0; for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *d = dst, *a = addr; + 
d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL); - if (d[w] == -1U && errno) + if (d[w] == -1U && errno) { + pr_perror("PEEKDATA failed"); goto err; + } } + errno = old_errno; return 0; err: - return -2; + return -errno; } int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes) { unsigned long w; - if (bytes & (sizeof(long) - 1)) + + if (bytes & (sizeof(long) - 1)) { + pr_err("Poke request with non-word size %ld\n", bytes); return -1; + } + for (w = 0; w < bytes / sizeof(long); w++) { unsigned long *s = src, *a = addr; - if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) + + if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w])) { + pr_perror("POKEDATA failed"); goto err; + } } return 0; err: - return -2; + return -errno; } /* don't swap big space, it might overflow the stack */ int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes) { void *t = alloca(bytes); + int err; - if (ptrace_peek_area(pid, t, dst, bytes)) - return -1; + err = ptrace_peek_area(pid, t, dst, bytes); + if (err) + return err; - if (ptrace_poke_area(pid, src, dst, bytes)) { - if (ptrace_poke_area(pid, t, dst, bytes)) - return -2; - return -1; + err = ptrace_poke_area(pid, src, dst, bytes); + if (err) { + int err2; + + pr_err("Can't poke %d @ %p from %p sized %ld\n", + pid, dst, src, bytes); + + err2 = ptrace_poke_area(pid, t, dst, bytes); + if (err2) { + pr_err("Can't restore the original data with poke\n"); + return err2; + } + return err; } memcpy(src, t, bytes); diff --git a/compel/src/main.c b/compel/src/main.c index 51bac099fe..36127c357d 100644 --- a/compel/src/main.c +++ b/compel/src/main.c @@ -13,15 +13,13 @@ #include #include -#include "uapi/compel/compel.h" - #include "version.h" #include "piegen.h" #include "log.h" #define CFLAGS_DEFAULT_SET \ "-Wstrict-prototypes " \ - "-fno-stack-protector -nostdlib -fomit-frame-pointer " + "-fno-stack-protector -nostdlib -fomit-frame-pointer -ffreestanding " #define COMPEL_CFLAGS_PIE CFLAGS_DEFAULT_SET "-fpie" #define COMPEL_CFLAGS_NOPIC 
CFLAGS_DEFAULT_SET "-fno-pic" diff --git a/compel/test/fdspy/spy.c b/compel/test/fdspy/spy.c index 258e3ab75b..1a373b6bb0 100644 --- a/compel/test/fdspy/spy.c +++ b/compel/test/fdspy/spy.c @@ -5,7 +5,6 @@ #include #include -#include #include "parasite.h" #define PARASITE_CMD_GETFD PARASITE_USER_CMDS diff --git a/compel/test/infect/spy.c b/compel/test/infect/spy.c index a5aba73089..b5f8b25593 100644 --- a/compel/test/infect/spy.c +++ b/compel/test/infect/spy.c @@ -3,7 +3,6 @@ #include #include -#include #include "parasite.h" #define PARASITE_CMD_INC PARASITE_USER_CMDS diff --git a/compel/test/rsys/spy.c b/compel/test/rsys/spy.c index f5c999d5a2..98654efcf3 100644 --- a/compel/test/rsys/spy.c +++ b/compel/test/rsys/spy.c @@ -4,8 +4,6 @@ #include #include -#include - static void print_vmsg(unsigned int lvl, const char *fmt, va_list parms) { printf("\tLC%u: ", lvl); diff --git a/coredump/criu_coredump/coredump.py b/coredump/criu_coredump/coredump.py index bc53a77051..68dc16bf2f 100644 --- a/coredump/criu_coredump/coredump.py +++ b/coredump/criu_coredump/coredump.py @@ -645,8 +645,7 @@ def _get_page(self, pid, page_no): ppid = self.pstree[pid]["ppid"] return self._get_page(ppid, page_no) else: - with open(self._imgs_dir + "/" + "pages-" + str(pages_id) + - ".img") as f: + with open(self._imgs_dir + "/pages-%s.img" % pages_id) as f: f.seek(off * PAGESIZE) return f.read(PAGESIZE) diff --git a/criu/Makefile b/criu/Makefile index 4134e5052e..ceb49ce099 100644 --- a/criu/Makefile +++ b/criu/Makefile @@ -2,7 +2,7 @@ # 6a8d90f5fec4 "attr: Allow attribute type 0" WRAPFLAGS += -Wl,--wrap=nla_parse,--wrap=nlmsg_parse -ARCH_DIR := criu/arch/$(SRCARCH) +ARCH_DIR := criu/arch/$(ARCH) PIE_DIR := criu/pie export ARCH_DIR PIE_DIR diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 4588ea5b8a..7ba7137bd6 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -29,6 +29,9 @@ obj-y += files-reg.o obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o +obj-y += 
img-remote.o +obj-y += img-proxy.o +obj-y += img-cache.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o @@ -37,6 +40,7 @@ obj-y += libnetlink.o obj-y += log.o obj-y += lsm.o obj-y += mem.o +obj-y += memfd.o obj-y += mount.o obj-y += filesystems.o obj-y += namespaces.o diff --git a/criu/aio.c b/criu/aio.c index 45651f2d3f..6ee65d5f4e 100644 --- a/criu/aio.c +++ b/criu/aio.c @@ -11,7 +11,7 @@ #include "parasite.h" #include "parasite-syscall.h" #include "images/mm.pb-c.h" -#include +#include "compel/infect.h" #define NR_IOEVENTS_IN_NPAGES(npages) ((PAGE_SIZE * (npages) - sizeof(struct aio_ring)) / sizeof(struct io_event)) diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c index f98743a23b..76bd1fea75 100644 --- a/criu/arch/aarch64/crtools.c +++ b/criu/arch/aarch64/crtools.c @@ -19,7 +19,7 @@ #include "util.h" #include "cpu.h" #include "restorer.h" -#include +#include "compel/infect.h" #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src)->e diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h index f502cdcaf6..120fa8fb20 100644 --- a/criu/arch/aarch64/include/asm/restorer.h +++ b/criu/arch/aarch64/include/asm/restorer.h @@ -42,6 +42,68 @@ "r"(&thread_args[i]) \ : "x0", "x1", "x2", "x3", "x8", "memory") +/* + * Based on sysdeps/unix/sysv/linux/aarch64/clone.S + * + * int clone(int (*fn)(void *arg), x0 + * void *child_stack, x1 + * int flags, x2 + * void *arg, x3 + * pid_t *ptid, x4 + * struct user_desc *tls, x5 + * pid_t *ctid); x6 + * + * int clone3(struct clone_args *args, x0 + * size_t size); x1 + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. 
+ * + */ + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* In contrast to the clone() wrapper above this does not put + * the thread function and its arguments on the child stack, + * but uses registers to pass these parameters to the child process. + * Based on the glibc clone() wrapper at + * sysdeps/unix/sysv/linux/aarch64/clone.S. + */ \ + "clone3_emul: \n" \ + /* + * Based on the glibc clone() wrapper, which uses x10 and x11 + * to save the arguments for the child process, this does the same. + * x10 for the thread function and x11 for the thread arguments. + */ \ + "mov x10, %3 /* clone_restore_fn */ \n" \ + "mov x11, %4 /* args */ \n" \ + "mov x0, %1 /* &clone_args */ \n" \ + "mov x1, %2 /* size */ \n" \ + /* Load syscall number */ \ + "mov x8, #"__stringify(__NR_clone3)" \n" \ + /* Do the syscall */ \ + "svc #0 \n" \ + \ + "cbz x0, clone3_thread_run \n" \ + \ + "mov %0, x0 \n" \ + "b clone3_end \n" \ + \ + "clone3_thread_run: \n" \ + /* Move args to x0 */ \ + "mov x0, x11 \n" \ + /* Jump to clone_restore_fn */ \ + "br x10 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "x0", "x1", "x8", "x10", "x11", "memory") + #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c index c216cdc5c0..840d489a65 100644 --- a/criu/arch/arm/crtools.c +++ b/criu/arch/arm/crtools.c @@ -18,8 +18,7 @@ #include "elf.h" #include "parasite-syscall.h" #include "restorer.h" - -#include +#include "compel/infect.h" #define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))((src)->ARM_##e) diff --git a/criu/arch/arm/include/asm/restore.h b/criu/arch/arm/include/asm/restore.h index 4c64d58ef8..c3b64c5b7e 100644 --- a/criu/arch/arm/include/asm/restore.h +++ b/criu/arch/arm/include/asm/restore.h @@ -16,7 +16,7 @@ : "r"(new_sp), \ "r"(restore_task_exec_start), \ "r"(task_args) 
\ - : "sp", "r0", "r1", "memory") + : "r0", "r1", "memory") static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls) { diff --git a/criu/arch/arm/include/asm/restorer.h b/criu/arch/arm/include/asm/restorer.h index 217d920e84..13ed15b263 100644 --- a/criu/arch/arm/include/asm/restorer.h +++ b/criu/arch/arm/include/asm/restorer.h @@ -43,6 +43,63 @@ "r"(&thread_args[i]) \ : "r0", "r1", "r2", "r3", "r7", "memory") + +/* + * The clone3() assembler wrapper is based on the clone() wrapper above + * and on code from the glibc wrapper at + * sysdeps/unix/sysv/linux/arm/clone.S + * + * For arm it is necessary to change the child stack as on x86_64 as + * it seems there are not registers which stay the same over a syscall + * like on s390x, ppc64le and aarch64. + * + * Changing the child stack means that this code has to deal with the + * kernel doing stack + stack_size implicitly. + * + * int clone3(struct clone_args *args, size_t size) + */ + +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + /* Load thread stack pointer */ \ + "ldr r1, [%3] \n" \ + /* Load thread stack size */ \ + "mov r2, %4 \n" \ + /* Goto to the end of stack */ \ + "add r1, r1, r2 \n" \ + /* Load thread function and arguments and push on stack */ \ + "mov r2, %6 /* args */ \n" \ + "str r2, [r1, #4] /* args */ \n" \ + "mov r2, %5 /* function */ \n" \ + "str r2, [r1] /* function */ \n" \ + "mov r0, %1 /* clone_args */ \n" \ + "mov r1, %2 /* size */ \n" \ + "mov r7, #"__stringify(__NR_clone3)" \n" \ + "svc #0 \n" \ + \ + "cmp r0, #0 \n" \ + "beq thread3_run \n" \ + \ + "mov %0, r0 \n" \ + "b clone3_end \n" \ + \ + "thread3_run: \n" \ + "pop { r1 } \n" \ + "pop { r0 } \n" \ + "bx r1 \n" \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(&clone_args.stack), \ + "r"(clone_args.stack_size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "r0", "r1", "r2", "r7", "memory") + #define 
ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "mov sp, %0 \n" \ diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c index 5a5966ad48..0d9f49c3fe 100644 --- a/criu/arch/ppc64/crtools.c +++ b/criu/arch/ppc64/crtools.c @@ -17,7 +17,7 @@ #include "log.h" #include "util.h" #include "cpu.h" -#include +#include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" diff --git a/criu/arch/ppc64/include/asm/restore.h b/criu/arch/ppc64/include/asm/restore.h index 8d4516090c..f065ec3a0c 100644 --- a/criu/arch/ppc64/include/asm/restore.h +++ b/criu/arch/ppc64/include/asm/restore.h @@ -21,7 +21,7 @@ : "r"(new_sp), \ "r"((unsigned long)restore_task_exec_start), \ "r"(task_args) \ - : "1", "3", "12") + : "3", "12") /* There is nothing to do since TLS is accessed through r13 */ #define core_get_tls(pcore, ptls) diff --git a/criu/arch/ppc64/include/asm/restorer.h b/criu/arch/ppc64/include/asm/restorer.h index d48d833d6b..c447eefeaa 100644 --- a/criu/arch/ppc64/include/asm/restorer.h +++ b/criu/arch/ppc64/include/asm/restorer.h @@ -48,6 +48,47 @@ "r"(&thread_args[i]) /* %6 */ \ : "memory","0","3","4","5","6","7","14","15") +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ +/* + * The clone3() function accepts following parameters: + * int clone3(struct clone_args *args, size_t size) + * + * Always consult the CLONE3 wrappers for other architectures + * for additional details. + * + * For PPC64LE the first parameter (clone_args) is passed in r3 and + * the second parameter (size) is passed in r4. + * + * This clone3() wrapper is based on the clone() wrapper from above. + */ \ + asm volatile( \ + "clone3_emul: \n" \ + "/* Save fn, args across syscall. */ \n" \ + "mr 14, %3 /* clone_restore_fn in r14 */ \n" \ + "mr 15, %4 /* &thread_args[i] in r15 */ \n" \ + "mr 3, %1 /* clone_args */ \n" \ + "mr 4, %2 /* size */ \n" \ + "li 0,"__stringify(__NR_clone3)" \n" \ + "sc \n" \ + "/* Check for child process. 
*/ \n" \ + "cmpdi cr1,3,0 \n" \ + "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \ + "bne- cr1,clone3_end \n" \ + "/* child */ \n" \ + "addi 14, 14, 8 /* jump over r2 fixup */ \n" \ + "mtctr 14 \n" \ + "mr 3,15 \n" \ + "bctr \n" \ + "clone3_end: \n" \ + "mr %0,3 \n" \ + : "=r"(ret) /* %0 */ \ + : "r"(&clone_args), /* %1 */ \ + "r"(size), /* %2 */ \ + "r"(clone_restore_fn), /* %3 */ \ + "r"(args) /* %4 */ \ + : "memory","0","3","4","5","14","15") + #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r); diff --git a/criu/arch/s390/crtools.c b/criu/arch/s390/crtools.c index 238035b763..000b7779f8 100644 --- a/criu/arch/s390/crtools.c +++ b/criu/arch/s390/crtools.c @@ -17,7 +17,7 @@ #include "log.h" #include "util.h" #include "cpu.h" -#include +#include "compel/infect.h" #include "protobuf.h" #include "images/core.pb-c.h" diff --git a/criu/arch/s390/include/asm/restore.h b/criu/arch/s390/include/asm/restore.h index 6463d8e628..b77e36c771 100644 --- a/criu/arch/s390/include/asm/restore.h +++ b/criu/arch/s390/include/asm/restore.h @@ -18,7 +18,7 @@ : "d" (new_sp), \ "d"((unsigned long)restore_task_exec_start), \ "d" (task_args) \ - : "2", "14", "15", "memory") + : "2", "14", "memory") /* There is nothing to do since TLS is accessed through %a01 */ #define core_get_tls(pcore, ptls) diff --git a/criu/arch/s390/include/asm/restorer.h b/criu/arch/s390/include/asm/restorer.h index cfdefcab9b..2fc2665354 100644 --- a/criu/arch/s390/include/asm/restorer.h +++ b/criu/arch/s390/include/asm/restorer.h @@ -39,6 +39,43 @@ "d"(&thread_args[i]) \ : "0", "1", "2", "3", "4", "5", "6", "cc", "memory") +#define RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + /* + * clone3 only needs two arguments (r2, r3), this means + * we can use r4 and r5 for args and thread function. + * r4 and r5 are callee-saved and are not overwritten. + * No need to put these values on the child stack. 
+ */ \ + "lgr %%r4,%4\n" /* Save args in %r4 */ \ + "lgr %%r5,%3\n" /* Save clone_restore_fn in %r5 */ \ + "lgr %%r2,%1\n" /* Parameter 1: clone_args */ \ + "lgr %%r3,%2\n" /* Parameter 2: size */ \ + /* + * On s390x a syscall is done sc . + * That only works for syscalls < 255. clone3 is 435, + * therefore it is necessary to load the syscall number + * into r1 and do 'svc 0'. + */ \ + "lghi %%r1,"__stringify(__NR_clone3)"\n" \ + "svc 0\n" \ + "ltgr %0,%%r2\n" /* Set and check "ret" */ \ + "jnz 0f\n" /* ret != 0: Continue caller */ \ + "lgr %%r2,%%r4\n" /* Thread arguments taken from r4. */ \ + "lgr %%r1,%%r5\n" /* Thread function taken from r5. */ \ + "aghi %%r15,-160\n" /* Prepare stack frame */ \ + "xc 0(8,%%r15),0(%%r15)\n" \ + "basr %%r14,%%r1\n" /* Jump to clone_restore_fn() */ \ + "j .+2\n" /* BUG(): Force PGM check */ \ +"0:\n" /* Continue caller */ \ + : "=d"(ret) \ + : "a"(&clone_args), \ + "d"(size), \ + "d"(clone_restore_fn), \ + "d"(args) \ + : "0", "1", "2", "3", "4", "5", "cc", "memory") + #define arch_map_vdso(map, compat) -1 int restore_gpregs(struct rt_sigframe *f, UserS390RegsEntry *r); diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c index 3808b9d33b..72c5bd59ca 100644 --- a/criu/arch/x86/cpu.c +++ b/criu/arch/x86/cpu.c @@ -236,6 +236,7 @@ static int cpu_validate_features(compel_cpuinfo_t *cpu_info) return -1; if (opts.cpu_cap & CPU_CAP_FPU) { + uint64_t m; /* * If we're requested to check FPU only ignore * any other bit. It's up to a user if the @@ -261,24 +262,33 @@ static int cpu_validate_features(compel_cpuinfo_t *cpu_info) #undef __mismatch_fpu_bit /* - * Make sure the xsave features are compatible. We already hit the - * issue with libc where we've checkpointed the container on old - * machine but restored on more modern one and libc fetched new - * xsave frame size directly by xsave instruction with greedy - * feature mask causing programs to misbehave. + * Make sure the xsave features are compatible. 
Check that on + * the destination there are all the features which were on the + * source. */ - if (cpu_info->xfeatures_mask > rt_cpu_info.xfeatures_mask) { - uint64_t m = cpu_info->xfeatures_mask & ~rt_cpu_info.xfeatures_mask; - pr_err("CPU xfeatures has unsupported bits (%#llx)\n", - (unsigned long long)m); + if ((m = cpu_info->xfeatures_mask & + ~rt_cpu_info.xfeatures_mask)) { + pr_err("CPU xfeatures has unsupported bits (%#" + PRIx64")\n", m); return -1; - } else if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { + } + + /* + * Make sure the xsave sizes are compatible. We already hit the + * issue with libc where we've checkpointed the container on + * old machine but restored on more modern one and libc fetched + * new xsave frame size directly by xsave instruction with + * greedy feature mask causing programs to misbehave. + */ + if (cpu_info->xsave_size != rt_cpu_info.xsave_size) { pr_err("CPU xsave size mismatch (%u/%u)\n", cpu_info->xsave_size, rt_cpu_info.xsave_size); return -1; - } else if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { + } + if (cpu_info->xsave_size_max != rt_cpu_info.xsave_size_max) { pr_err("CPU xsave max size mismatch (%u/%u)\n", - cpu_info->xsave_size_max, rt_cpu_info.xsave_size_max); + cpu_info->xsave_size_max, + rt_cpu_info.xsave_size_max); return -1; } } diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c index efc23e5fea..9c8beeeddd 100644 --- a/criu/arch/x86/crtools.c +++ b/criu/arch/x86/crtools.c @@ -1,5 +1,5 @@ #include "compel/asm/fpu.h" -#include "compel/compel.h" +#include "compel/infect.h" #include "compel/plugins/std/syscall-codes.h" #include "cpu.h" #include "cr_options.h" @@ -590,8 +590,7 @@ static int get_robust_list32(pid_t pid, uintptr_t head, uintptr_t len) .arg2 = (uint32_t)len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } static int set_robust_list32(uint32_t head, uint32_t len) @@ -602,8 +601,7 @@ static int set_robust_list32(uint32_t head, uint32_t len) 
.arg1 = len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } int get_task_futex_robust_list_compat(pid_t pid, ThreadCoreEntry *info) diff --git a/criu/arch/x86/include/asm/compat.h b/criu/arch/x86/include/asm/compat.h index cd1ae472d7..acd552fb38 100644 --- a/criu/arch/x86/include/asm/compat.h +++ b/criu/arch/x86/include/asm/compat.h @@ -38,26 +38,45 @@ struct syscall_args32 { uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; }; -static inline void do_full_int80(struct syscall_args32 *args) +static inline uint32_t do_full_int80(struct syscall_args32 *args) { /* - * r8-r11 registers are cleared during returning to userspace - * from syscall - that's x86_64 ABI to avoid leaking kernel - * pointers. + * Kernels older than v4.4 do not preserve r8-r15 registers when + * invoking int80, so we need to preserve them. * - * Other than that - we can't use %rbp in clobbers as GCC's inline - * assembly doesn't allow to do so. So, here is explicitly saving - * %rbp before syscall and restoring it's value afterward. + * Additionally, %rbp is used as the 6th syscall argument, and we need + * to preserve its value when returning from the syscall to avoid + * upsetting GCC. However, we can't use %rbp in the GCC asm clobbers + * due to a GCC limitation. Instead, we explicitly save %rbp on the + * stack before invoking the syscall and restore its value afterward. + * + * Further, GCC may not adjust the %rsp pointer when allocating the + * args and ret variables because 1) do_full_int80() is a leaf + * function, and 2) the local variables (args and ret) are in the + * 128-byte red-zone as defined in the x86_64 ABI. To use the stack + * when preserving %rbp, we must either tell GCC to a) mark the + * function as non-leaf, or b) move away from the red-zone when using + * the stack. It seems that there is no easy way to do a), so we'll go + * with b).
+ * Note 1: Another workaround would have been to add %rsp in the list + * of clobbers, but this was deprecated in GCC 9. + * Note 2: This red-zone bug only manifests when compiling CRIU with + * DEBUG=1. */ - asm volatile ("pushq %%rbp\n\t" - "mov %6, %%ebp\n\t" - "int $0x80\n\t" - "mov %%ebp, %6\n\t" - "popq %%rbp\n\t" - : "+a" (args->nr), - "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2), - "+S" (args->arg3), "+D" (args->arg4), "+g" (args->arg5) - : : "r8", "r9", "r10", "r11"); + uint32_t ret; + + asm volatile ("sub $128, %%rsp\n\t" + "pushq %%rbp\n\t" + "mov %7, %%ebp\n\t" + "int $0x80\n\t" + "popq %%rbp\n\t" + "add $128, %%rsp\n\t" + : "=a" (ret) + : "a" (args->nr), + "b" (args->arg0), "c" (args->arg1), "d" (args->arg2), + "S" (args->arg3), "D" (args->arg4), "g" (args->arg5) + : "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"); + return ret; } #ifndef CR_NOGLIBC diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index 25559b57c0..731477ec99 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -25,6 +25,21 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) } #endif /* !CONFIG_COMPAT */ +/* + * Documentation copied from glibc sysdeps/unix/sysv/linux/x86_64/clone.S + * The kernel expects: + * rax: system call number + * rdi: flags + * rsi: child_stack + * rdx: TID field in parent + * r10: TID field in child + * r8: thread pointer + * + * int clone(unsigned long clone_flags, unsigned long newsp, + * int *parent_tidptr, int *child_tidptr, + * unsigned long tls); + */ + #define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \ thread_args, clone_restore_fn) \ asm volatile( \ @@ -63,6 +78,83 @@ static inline int set_compat_robust_list(uint32_t head_ptr, uint32_t len) "g"(&thread_args[i]) \ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") +/* int clone3(struct clone_args *args, size_t size) */ +#define 
RUN_CLONE3_RESTORE_FN(ret, clone_args, size, args, \ + clone_restore_fn) \ + asm volatile( \ + "clone3_emul: \n" \ + /* + * Prepare stack pointer for child process. The kernel does + * stack + stack_size before passing the stack pointer to the + * child process. As we have to put the function and the + * arguments for the new process on that stack we have to handle + * the kernel's implicit stack + stack_size. + */ \ + "movq (%3), %%rsi /* new stack pointer */ \n" \ + /* Move the stack_size to %rax to use later as the offset */ \ + "movq %4, %%rax \n" \ + /* 16 bytes are needed on the stack for function and args */ \ + "subq $16, (%%rsi, %%rax) \n" \ + "movq %6, %%rdi /* thread args */ \n" \ + "movq %%rdi, 8(%%rsi, %%rax) \n" \ + "movq %5, %%rdi /* thread function */ \n" \ + "movq %%rdi, 0(%%rsi, %%rax) \n" \ + /* + * The stack address has been modified for the two + * elements above (child function, child arguments). + * This modified stack needs to be stored back into the + * clone_args structure. + */ \ + "movq (%%rsi), %3 \n" \ + /* + * Do the actual clone3() syscall. First argument (%rdi) is + * the clone_args structure, second argument is the size + * of clone_args. + */ \ + "movq %1, %%rdi /* clone_args */ \n" \ + "movq %2, %%rsi /* size */ \n" \ + "movl $"__stringify(__NR_clone3)", %%eax \n" \ + "syscall \n" \ + /* + * If clone3() was successful and if we are in the child + * '0' is returned. Jump to the child function handler. + */ \ + "testq %%rax,%%rax \n" \ + "jz thread3_run \n" \ + /* Return the PID to the parent process.
*/ \ + "movq %%rax, %0 \n" \ + "jmp clone3_end \n" \ + \ + "thread3_run: /* Child process */ \n" \ + /* Clear the frame pointer */ \ + "xorq %%rbp, %%rbp \n" \ + /* Pop the child function from the stack */ \ + "popq %%rax \n" \ + /* Pop the child function arguments from the stack */ \ + "popq %%rdi \n" \ + /* Run the child function */ \ + "callq *%%rax \n" \ + /* + * If the child function is expected to return, this + * would be the place to handle the return code. In CRIU's + * case the child function is expected to not return + * and do exit() itself. + */ \ + \ + "clone3_end: \n" \ + : "=r"(ret) \ + /* + * This uses the "r" modifier for all parameters + * as clang complained if using "g". + */ \ + : "r"(&clone_args), \ + "r"(size), \ + "r"(&clone_args.stack), \ + "r"(clone_args.stack_size), \ + "r"(clone_restore_fn), \ + "r"(args) \ + : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory") + #define ARCH_FAIL_CORE_RESTORE \ asm volatile( \ "movq %0, %%rsp \n" \ diff --git a/criu/arch/x86/kerndat.c b/criu/arch/x86/kerndat.c index f7593251b2..94c954e1e4 100644 --- a/criu/arch/x86/kerndat.c +++ b/criu/arch/x86/kerndat.c @@ -75,9 +75,7 @@ void *mmap_ia32(void *addr, size_t len, int prot, s.arg4 = fildes; s.arg5 = (uint32_t)off; - do_full_int80(&s); - - return (void *)(uintptr_t)s.nr; + return (void *)(uintptr_t)do_full_int80(&s); } /* diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c index 2d335d5e1d..b2c3b3668a 100644 --- a/criu/arch/x86/restorer.c +++ b/criu/arch/x86/restorer.c @@ -54,8 +54,7 @@ int set_compat_robust_list(uint32_t head_ptr, uint32_t len) .arg1 = len, }; - do_full_int80(&s); - return (int)s.nr; + return do_full_int80(&s); } static int prepare_stack32(void **stack32) diff --git a/criu/arch/x86/sigaction_compat.c b/criu/arch/x86/sigaction_compat.c index b38ba80118..f467da490e 100644 --- a/criu/arch/x86/sigaction_compat.c +++ b/criu/arch/x86/sigaction_compat.c @@ -28,7 +28,6 @@ extern char restore_rt_sigaction; */ int 
arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) { - int ret; struct syscall_args32 arg = {}; unsigned long act_stack = (unsigned long)stack32; @@ -49,8 +48,5 @@ int arch_compat_rt_sigaction(void *stack32, int sig, rt_sigaction_t_compat *act) arg.arg2 = 0; /* oldact */ arg.arg3 = (uint32_t)sizeof(act->rt_sa_mask); /* sigsetsize */ - do_full_int80(&arg); - asm volatile ("\t movl %%eax,%0\n" : "=r"(ret)); - return ret; + return do_full_int80(&arg); } - diff --git a/criu/arch/x86/sigframe.c b/criu/arch/x86/sigframe.c index 11b0d640de..33ba14387f 100644 --- a/criu/arch/x86/sigframe.c +++ b/criu/arch/x86/sigframe.c @@ -28,8 +28,14 @@ int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, sigframe->native.uc.uc_mcontext.fpstate = (uint64_t)addr; } else if (!sigframe->is_native) { + unsigned long addr = (unsigned long)(void *)&fpu_state->fpu_state_ia32.xsave; sigframe->compat.uc.uc_mcontext.fpstate = (uint32_t)(unsigned long)(void *)&fpu_state->fpu_state_ia32; + if ((addr % 64ul)) { + pr_err("Unaligned address passed: %lx (native %d)\n", + addr, sigframe->is_native); + return -1; + } } return 0; diff --git a/criu/arch/x86/sys-exec-tbl.c b/criu/arch/x86/sys-exec-tbl.c index 608dc2510d..225b8a1535 100644 --- a/criu/arch/x86/sys-exec-tbl.c +++ b/criu/arch/x86/sys-exec-tbl.c @@ -1,4 +1,3 @@ -#include static struct syscall_exec_desc sc_exec_table_64[] = { #include "sys-exec-tbl-64.c" diff --git a/criu/cgroup.c b/criu/cgroup.c index 332c79fb9a..d4c7121673 100644 --- a/criu/cgroup.c +++ b/criu/cgroup.c @@ -8,6 +8,7 @@ #include #include #include + #include "common/list.h" #include "xmalloc.h" #include "cgroup.h" @@ -24,6 +25,8 @@ #include "protobuf.h" #include "images/core.pb-c.h" #include "images/cgroup.pb-c.h" +#include "kerndat.h" +#include "linux/mount.h" /* * This structure describes set of controller groups @@ -542,6 +545,84 @@ static int add_freezer_state(struct cg_controller *controller) return 0; } +static const char namestr[] = "name="; 
+static int __new_open_cgroupfs(struct cg_ctl *cc) +{ + int fsfd, fd; + char *name; + + fsfd = sys_fsopen("cgroup", 0); + if (fsfd < 0) { + pr_perror("Unable to open the cgroup file system"); + return -1; + } + + if (strstartswith(cc->name, namestr)) { + if (sys_fsconfig(fsfd, FSCONFIG_SET_STRING, + "name", cc->name + strlen(namestr), 0)) { + pr_perror("Unable to configure the cgroup (%s) file system", cc->name); + goto err; + } + } else { + char *saveptr = NULL, *buf = strdupa(cc->name); + name = strtok_r(buf, ",", &saveptr); + while (name) { + if (sys_fsconfig(fsfd, FSCONFIG_SET_FLAG, name, NULL, 0)) { + pr_perror("Unable to configure the cgroup (%s) file system", name); + goto err; + } + name = strtok_r(NULL, ",", &saveptr); + } + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)) { + pr_perror("Unable to create the cgroup (%s) file system", cc->name); + goto err; + } + + fd = sys_fsmount(fsfd, 0, 0); + if (fd < 0) + pr_perror("Unable to mount the cgroup (%s) file system", cc->name); + close(fsfd); + + return fd; +err: + close(fsfd); + return -1; +} + +static int open_cgroupfs(struct cg_ctl *cc) +{ + char prefix[] = ".criu.cgmounts.XXXXXX"; + char mopts[1024]; + int fd; + + if (kdat.has_fsopen) + return __new_open_cgroupfs(cc); + + if (strstartswith(cc->name, namestr)) + snprintf(mopts, sizeof(mopts), "none,%s", cc->name); + else + snprintf(mopts, sizeof(mopts), "%s", cc->name); + + if (mkdtemp(prefix) == NULL) { + pr_perror("can't make dir for cg mounts"); + return -1; + } + + if (mount("none", prefix, "cgroup", 0, mopts) < 0) { + pr_perror("Unable to mount %s", mopts); + rmdir(prefix); + return -1; + } + + fd = open_detach_mount(prefix); + if (fd < 0) + return -1; + + return fd; +} + static int collect_cgroups(struct list_head *ctls) { struct cg_ctl *cc; @@ -549,8 +630,7 @@ static int collect_cgroups(struct list_head *ctls) int fd = -1; list_for_each_entry(cc, ctls, l) { - char path[PATH_MAX], mopts[1024], *root; - char prefix[] = 
".criu.cgmounts.XXXXXX"; + char path[PATH_MAX], *root; struct cg_controller *cg; struct cg_root_opt *o; @@ -568,7 +648,7 @@ static int collect_cgroups(struct list_head *ctls) if (!current_controller) { /* only allow "fake" controllers to be created this way */ - if (!strstartswith(cc->name, "name=")) { + if (!strstartswith(cc->name, namestr)) { pr_err("controller %s not found\n", cc->name); return -1; } else { @@ -586,26 +666,25 @@ static int collect_cgroups(struct list_head *ctls) if (!opts.manage_cgroups) continue; - if (strstartswith(cc->name, "name=")) - snprintf(mopts, sizeof(mopts), "none,%s", cc->name); - else - snprintf(mopts, sizeof(mopts), "%s", cc->name); + if (opts.cgroup_yard) { + char dir_path[PATH_MAX]; + int off; - if (mkdtemp(prefix) == NULL) { - pr_perror("can't make dir for cg mounts"); - return -1; - } + off = snprintf(dir_path, PATH_MAX, "%s/", opts.cgroup_yard); + if (strstartswith(cc->name, namestr)) + snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name + strlen(namestr)); + else + snprintf(dir_path + off, PATH_MAX - off, "%s", cc->name); - if (mount("none", prefix, "cgroup", 0, mopts) < 0) { - pr_perror("couldn't mount %s", mopts); - rmdir(prefix); - return -1; + fd = open(dir_path, O_RDONLY | O_DIRECTORY, 0); + if (fd < 0) { + pr_perror("couldn't open %s", dir_path); + return -1; + } + } else { + fd = open_cgroupfs(cc); } - fd = open_detach_mount(prefix); - if (fd < 0) - return -1; - path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd); root = cc->path; @@ -620,6 +699,7 @@ static int collect_cgroups(struct list_head *ctls) snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", root); ret = ftw(path, add_cgroup, 4); + if (ret < 0) pr_perror("failed walking %s for empty cgroups", path); @@ -1167,10 +1247,12 @@ void fini_cgroup(void) return; close_service_fd(CGROUP_YARD); - if (umount2(cg_yard, MNT_DETACH)) - pr_perror("Unable to umount %s", cg_yard); - if (rmdir(cg_yard)) - pr_perror("Unable to remove %s", cg_yard); + 
if (!opts.cgroup_yard) { + if (umount2(cg_yard, MNT_DETACH)) + pr_perror("Unable to umount %s", cg_yard); + if (rmdir(cg_yard)) + pr_perror("Unable to remove %s", cg_yard); + } xfree(cg_yard); cg_yard = NULL; } @@ -1652,20 +1734,28 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n", opts.manage_cgroups); - off = sprintf(paux, ".criu.cgyard.XXXXXX"); - if (mkdtemp(paux) == NULL) { - pr_perror("Can't make temp cgyard dir"); - return -1; - } + if (opts.cgroup_yard) { + off = sprintf(paux, "%s", opts.cgroup_yard); - cg_yard = xstrdup(paux); - if (!cg_yard) { - rmdir(paux); - return -1; - } + cg_yard = xstrdup(paux); + if (!cg_yard) + return -1; + } else { + off = sprintf(paux, ".criu.cgyard.XXXXXX"); + if (mkdtemp(paux) == NULL) { + pr_perror("Can't make temp cgyard dir"); + return -1; + } - if (make_yard(cg_yard)) - goto err; + cg_yard = xstrdup(paux); + if (!cg_yard) { + rmdir(paux); + return -1; + } + + if (make_yard(cg_yard)) + goto err; + } pr_debug("Opening %s as cg yard\n", cg_yard); i = open(cg_yard, O_DIRECTORY); @@ -1699,11 +1789,11 @@ static int prepare_cgroup_sfd(CgroupEntry *ce) pr_debug("\tMaking controller dir %s (%s)\n", paux, opt); if (mkdir(paux, 0700)) { pr_perror("\tCan't make controller dir %s", paux); - return -1; + goto err; } if (mount("none", paux, "cgroup", 0, opt) < 0) { pr_perror("\tCan't mount controller dir %s", paux); - return -1; + goto err; } } diff --git a/criu/clone-noasan.c b/criu/clone-noasan.c index 5ca280eb83..a2190ba0af 100644 --- a/criu/clone-noasan.c +++ b/criu/clone-noasan.c @@ -1,4 +1,10 @@ +#include #include +#include + +#include + +#include "sched.h" #include "common/compiler.h" #include "log.h" #include "common/bug.h" @@ -18,10 +24,20 @@ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69863 * * So the only way is to put this wrapper in separate non-instrumented file + * + * WARNING: When calling clone_noasan make sure you're not sitting in a later
phase where other tasks might be creating threads, otherwise + * all calls to clone_noasan should be guarder with + * + * lock_last_pid + * clone_noasan + * ... wait for process to finish ... + * unlock_last_pid */ int clone_noasan(int (*fn)(void *), int flags, void *arg) { void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); + BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); /* * Reserve some bytes for clone() internal needs @@ -29,3 +45,40 @@ int clone_noasan(int (*fn)(void *), int flags, void *arg) */ return clone(fn, stack_ptr, flags, arg); } + +int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, + int exit_signal, pid_t pid) +{ + struct _clone_args c_args = {}; + + BUG_ON(flags & CLONE_VM); + + /* + * Make sure no child signals are requested. clone3() uses + * exit_signal for that. + */ + BUG_ON(flags & 0xff); + + pr_debug("Creating process using clone3()\n"); + + /* + * clone3() explicitly blocks setting an exit_signal + * if CLONE_PARENT is specified. With clone() it also + * did not work, but there was no error message. The + * exit signal from the thread group leader is taken. 
+ */ + if (!(flags & CLONE_PARENT)) { + if (exit_signal != SIGCHLD) { + pr_err("Exit signal not SIGCHLD\n"); + return -1; + } + c_args.exit_signal = exit_signal; + } + c_args.flags = flags; + c_args.set_tid = ptr_to_u64(&pid); + c_args.set_tid_size = 1; + pid = syscall(__NR_clone3, &c_args, sizeof(c_args)); + if (pid == 0) + exit(fn(arg)); + return pid; +} diff --git a/criu/config.c b/criu/config.c index 39aa071c99..df5d851626 100644 --- a/criu/config.c +++ b/criu/config.c @@ -30,6 +30,7 @@ #include "common/xmalloc.h" struct cr_options opts; +char *rpc_cfg_file; static int count_elements(char **to_count) { @@ -276,6 +277,7 @@ void init_opts(void) opts.empty_ns = 0; opts.status_fd = -1; opts.log_level = DEFAULT_LOGLEVEL; + opts.pre_dump_mode = PRE_DUMP_SPLICE; } bool deprecated_ok(char *what) @@ -508,6 +510,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), { "verbosity", optional_argument, 0, 'v' }, { "ps-socket", required_argument, 0, 1091}, + BOOL_OPT("remote", &opts.remote), { "config", required_argument, 0, 1089}, { "no-default-config", no_argument, 0, 1090}, { "tls-cacert", required_argument, 0, 1092}, @@ -516,6 +519,8 @@ int parse_options(int argc, char **argv, bool *usage_error, { "tls-key", required_argument, 0, 1095}, BOOL_OPT("tls", &opts.tls), {"tls-no-cn-verify", no_argument, &opts.tls_no_cn_verify, true}, + { "cgroup-yard", required_argument, 0, 1096 }, + { "pre-dump-mode", required_argument, 0, 1097}, { }, }; @@ -814,6 +819,17 @@ int parse_options(int argc, char **argv, bool *usage_error, case 1095: SET_CHAR_OPTS(tls_key, optarg); break; + case 1096: + SET_CHAR_OPTS(cgroup_yard, optarg); + break; + case 1097: + if (!strcmp("read", optarg)) { + opts.pre_dump_mode = PRE_DUMP_READ; + } else if (strcmp("splice", optarg)) { + pr_err("Unable to parse value of --pre-dump-mode\n"); + return 1; + } + break; case 'V': pr_msg("Version: %s\n", CRIU_VERSION); if (strcmp(CRIU_GITID, "0")) @@ -831,15 +847,15 
@@ int parse_options(int argc, char **argv, bool *usage_error, bad_arg: if (idx < 0) /* short option */ - pr_msg("Error: invalid argument for -%c: %s\n", + pr_err("invalid argument for -%c: %s\n", opt, optarg); else /* long option */ - pr_msg("Error: invalid argument for --%s: %s\n", + pr_err("invalid argument for --%s: %s\n", long_opts[idx].name, optarg); return 1; } -int check_options() +int check_options(void) { if (opts.tcp_established_ok) pr_info("Will dump/restore TCP connections\n"); diff --git a/criu/cr-check.c b/criu/cr-check.c index 75a665cfbf..80df3f7cdc 100644 --- a/criu/cr-check.c +++ b/criu/cr-check.c @@ -51,7 +51,7 @@ #include "restorer.h" #include "uffd.h" -static char *feature_name(int (*func)()); +static char *feature_name(int (*func)(void)); static int check_tty(void) { @@ -62,7 +62,7 @@ static int check_tty(void) int ret = -1; if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) { - pr_msg("struct termios has %d @c_cc while " + pr_err("struct termios has %d @c_cc while " "at least %d expected.\n", (int)ARRAY_SIZE(t.c_cc), TERMIOS_NCC); @@ -513,7 +513,7 @@ static int check_ipc(void) return -1; } -static int check_sigqueuinfo() +static int check_sigqueuinfo(void) { siginfo_t info = { .si_code = 1 }; @@ -960,7 +960,7 @@ static int clone_cb(void *_arg) { exit(0); } -static int check_clone_parent_vs_pid() +static int check_clone_parent_vs_pid(void) { struct clone_arg ca; pid_t pid; @@ -1224,6 +1224,16 @@ static int check_uffd_noncoop(void) return 0; } +static int check_clone3_set_tid(void) +{ + if (!kdat.has_clone3_set_tid) { + pr_warn("clone3() with set_tid not supported\n"); + return -1; + } + + return 0; +} + static int check_can_map_vdso(void) { if (kdat_can_map_vdso() == 1) @@ -1373,6 +1383,7 @@ int cr_check(void) ret |= check_sk_netns(); ret |= check_kcmp_epoll(); ret |= check_net_diag_raw(); + ret |= check_clone3_set_tid(); } /* @@ -1447,7 +1458,7 @@ static int check_external_net_ns(void) struct feature_list { char *name; - int (*func)(); + int 
(*func)(void); }; static struct feature_list feature_list[] = { @@ -1476,6 +1487,7 @@ static struct feature_list feature_list[] = { { "link_nsid", check_link_nsid}, { "kcmp_epoll", check_kcmp_epoll}, { "external_net_ns", check_external_net_ns}, + { "clone3_set_tid", check_clone3_set_tid}, { NULL, NULL }, }; @@ -1517,7 +1529,7 @@ int check_add_feature(char *feat) return -1; } -static char *feature_name(int (*func)()) +static char *feature_name(int (*func)(void)) { struct feature_list *fl; diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 9273fc0a51..7b46e663b5 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -80,6 +80,8 @@ #include "fault-injection.h" #include "dump.h" #include "eventpoll.h" +#include "img-remote.h" +#include "memfd.h" /* * Architectures can overwrite this function to restore register sets that @@ -414,7 +416,10 @@ static int dump_filemap(struct vma_area *vma_area, int fd) /* Flags will be set during restore in open_filmap() */ - ret = dump_one_reg_file_cond(fd, &id, &p); + if (vma->status & VMA_AREA_MEMFD) + ret = dump_one_memfd_cond(fd, &id, &p); + else + ret = dump_one_reg_file_cond(fd, &id, &p); vma->shmid = id; return ret; @@ -782,8 +787,6 @@ static int dump_task_core_all(struct parasite_ctl *ctl, img = img_from_set(cr_imgset, CR_FD_CORE); ret = pb_write_one(img, core, PB_CORE); - if (ret < 0) - goto err; err: pr_info("----------------------------------------\n"); @@ -1387,16 +1390,20 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) ret = compel_stop_daemon(parasite_ctl); if (ret) { - pr_err("Can't cure (pid: %d) from parasite\n", pid); - goto err; + pr_err("Can't stop daemon in parasite (pid: %d)\n", pid); + goto err_cure; } ret = dump_task_threads(parasite_ctl, item); if (ret) { pr_err("Can't dump threads\n"); - goto err; + goto err_cure; } + /* + * On failure local map will be cured in cr_dump_finish() + * for lazy pages. 
+ */ if (opts.lazy_pages) ret = compel_cure_remote(parasite_ctl); else @@ -1429,13 +1436,15 @@ static int dump_one_task(struct pstree_item *item, InventoryEntry *parent_ie) err_cure: close_cr_imgset(&cr_imgset); err_cure_imgset: - compel_cure(parasite_ctl); + ret = compel_cure(parasite_ctl); + if (ret) + pr_err("Can't cure (pid: %d) from parasite\n", pid); goto err; } static int alarm_attempts = 0; -bool alarm_timeouted() { +bool alarm_timeouted(void) { return alarm_attempts > 0; } @@ -1452,7 +1461,7 @@ static void alarm_handler(int signo) BUG(); } -static int setup_alarm_handler() +static int setup_alarm_handler(void) { struct sigaction sa = { .sa_handler = alarm_handler, @@ -1487,6 +1496,9 @@ static int cr_pre_dump_finish(int status) if (ret) goto err; + he.has_pre_dump_mode = true; + he.pre_dump_mode = opts.pre_dump_mode; + pstree_switch_state(root_item, TASK_ALIVE); timing_stop(TIME_FROZEN); @@ -1512,7 +1524,15 @@ static int cr_pre_dump_finish(int status) goto err; mem_pp = dmpi(item)->mem_pp; - ret = page_xfer_dump_pages(&xfer, mem_pp); + + if (opts.pre_dump_mode == PRE_DUMP_READ) { + timing_stop(TIME_MEMWRITE); + ret = page_xfer_predump_pages(item->pid->real, + &xfer, mem_pp); + } + else { + ret = page_xfer_dump_pages(&xfer, mem_pp); + } xfer.close(&xfer); @@ -1522,7 +1542,8 @@ static int cr_pre_dump_finish(int status) timing_stop(TIME_MEMWRITE); destroy_page_pipe(mem_pp); - compel_cure_local(ctl); + if (compel_cure_local(ctl)) + pr_err("Can't cure local: something happened with mapping?\n"); } free_pstree(root_item); @@ -1563,6 +1584,11 @@ int cr_pre_dump_tasks(pid_t pid) */ rlimit_unlimit_nofile(); + if (opts.remote && push_snapshot_id() < 0) { + pr_err("Failed to push image namespace.\n"); + goto err; + } + root_item = alloc_pstree_item(); if (!root_item) goto err; @@ -1649,7 +1675,8 @@ static int cr_lazy_mem_dump(void) for_each_pstree_item(item) { if (item->pid->state != TASK_DEAD) { destroy_page_pipe(dmpi(item)->mem_pp); - 
compel_cure_local(dmpi(item)->parasite_ctl); + if (compel_cure_local(dmpi(item)->parasite_ctl)) + pr_err("Can't cure local: something happened with mapping?\n"); } } @@ -1739,6 +1766,11 @@ static int cr_dump_finish(int ret) close_service_fd(CR_PROC_FD_OFF); + if (opts.remote && (finish_remote_dump() < 0)) { + pr_err("Finish remote dump failed.\n"); + return post_dump_ret ? : 1; + } + if (ret) { pr_err("Dumping FAILED.\n"); } else { @@ -1767,6 +1799,11 @@ int cr_dump_tasks(pid_t pid) */ rlimit_unlimit_nofile(); + if (opts.remote && push_snapshot_id() < 0) { + pr_err("Failed to push image namespace.\n"); + goto err; + } + root_item = alloc_pstree_item(); if (!root_item) goto err; @@ -1839,6 +1876,9 @@ int cr_dump_tasks(pid_t pid) if (collect_namespaces(true) < 0) goto err; + if (collect_unix_bindmounts() < 0) + goto err; + glob_imgset = cr_glob_imgset_open(O_DUMP); if (!glob_imgset) goto err; @@ -1916,6 +1956,8 @@ int cr_dump_tasks(pid_t pid) if (ret) goto err; + he.has_pre_dump_mode = false; + ret = write_img_inventory(&he); if (ret) goto err; diff --git a/criu/cr-restore.c b/criu/cr-restore.c index b4530f8e5d..c79fe94779 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -23,10 +23,13 @@ #include #include "common/compiler.h" +#include "linux/mount.h" + #include "clone-noasan.h" #include "cr_options.h" #include "servicefd.h" #include "image.h" +#include "img-remote.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" @@ -65,7 +68,6 @@ #include "timerfd.h" #include "action-scripts.h" #include "shmem.h" -#include #include "aio.h" #include "lsm.h" #include "seccomp.h" @@ -73,6 +75,8 @@ #include "sk-queue.h" #include "sigframe.h" #include "fdstore.h" +#include "memfd.h" +#include "string.h" #include "parasite-syscall.h" #include "files-reg.h" @@ -180,13 +184,13 @@ static int __restore_wait_inprogress_tasks(int participants) return 0; } -static int restore_wait_inprogress_tasks() +static int restore_wait_inprogress_tasks(void) { return 
__restore_wait_inprogress_tasks(0); } /* Wait all tasks except the current one */ -static int restore_wait_other_tasks() +static int restore_wait_other_tasks(void) { int participants, stage; @@ -249,6 +253,9 @@ static int crtools_prepare_shared(void) if (prepare_cgroup()) return -1; + if (unix_prepare_shared()) + return -1; + return 0; } @@ -269,7 +276,6 @@ static struct collect_image_info *cinfos[] = { }; static struct collect_image_info *cinfos_files[] = { - &unix_sk_cinfo, &fifo_cinfo, &pipe_cinfo, &nsfile_cinfo, @@ -286,12 +292,14 @@ static struct collect_image_info *cinfos_files[] = { &fanotify_cinfo, &fanotify_mark_cinfo, &ext_file_cinfo, + &memfd_cinfo, }; /* These images are required to restore namespaces */ static struct collect_image_info *before_ns_cinfos[] = { &tty_info_cinfo, /* Restore devpts content */ &tty_cdata, + &unix_sk_cinfo, }; static struct pprep_head *post_prepare_heads = NULL; @@ -1372,40 +1380,55 @@ static inline int fork_with_pid(struct pstree_item *item) if (!(ca.clone_flags & CLONE_NEWPID)) { char buf[32]; int len; - int fd; + int fd = -1; - fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); - if (fd < 0) - goto err; + if (!kdat.has_clone3_set_tid) { + fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); + if (fd < 0) + goto err; + } lock_last_pid(); - len = snprintf(buf, sizeof(buf), "%d", pid - 1); - if (write(fd, buf, len) != len) { - pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH); + if (!kdat.has_clone3_set_tid) { + len = snprintf(buf, sizeof(buf), "%d", pid - 1); + if (write(fd, buf, len) != len) { + pr_perror("%d: Write %s to %s", pid, buf, + LAST_PID_PATH); + close(fd); + goto err_unlock; + } close(fd); - goto err_unlock; } - close(fd); } else { BUG_ON(pid != INIT_PID); } - /* - * Some kernel modules, such as network packet generator - * run kernel thread upon net-namespace creattion taking - * the @pid we've been requeting via LAST_PID_PATH interface - * so that we can't restore a take with pid needed. 
- * - * Here is an idea -- unhare net namespace in callee instead. - */ - /* - * The cgroup namespace is also unshared explicitly in the - * move_in_cgroup(), so drop this flag here as well. - */ - close_pid_proc(); - ret = clone_noasan(restore_task_with_children, - (ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca); + if (kdat.has_clone3_set_tid) { + ret = clone3_with_pid_noasan(restore_task_with_children, + &ca, (ca.clone_flags & + ~(CLONE_NEWNET | CLONE_NEWCGROUP)), + SIGCHLD, pid); + } else { + /* + * Some kernel modules, such as network packet generator + * run kernel thread upon net-namespace creation taking + * the @pid we've been requesting via LAST_PID_PATH interface + * so that we can't restore a take with pid needed. + * + * Here is an idea -- unshare net namespace in callee instead. + */ + /* + * The cgroup namespace is also unshared explicitly in the + * move_in_cgroup(), so drop this flag here as well. + */ + close_pid_proc(); + ret = clone_noasan(restore_task_with_children, + (ca.clone_flags & + ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, + &ca); + } + if (ret < 0) { pr_perror("Can't fork for %d", pid); goto err_unlock; @@ -1585,27 +1608,39 @@ static void restore_pgid(void) futex_set_and_wake(&rsti(current)->pgrp_set, 1); } +static int __legacy_mount_proc(void) +{ + char proc_mountpoint[] = "/tmp/crtools-proc.XXXXXX"; + int fd; + + if (mkdtemp(proc_mountpoint) == NULL) { + pr_perror("mkdtemp failed %s", proc_mountpoint); + return -1; + } + + pr_info("Mount procfs in %s\n", proc_mountpoint); + if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { + pr_perror("mount failed"); + if (rmdir(proc_mountpoint)) + pr_perror("Unable to remove %s", proc_mountpoint); + return -1; + } + + fd = open_detach_mount(proc_mountpoint); + return fd; +} + static int mount_proc(void) { int fd, ret; - char proc_mountpoint[] = "crtools-proc.XXXXXX"; if (root_ns_mask == 0) fd = ret = open("/proc", O_DIRECTORY); 
else { - if (mkdtemp(proc_mountpoint) == NULL) { - pr_perror("mkdtemp failed %s", proc_mountpoint); - return -1; - } - - pr_info("Mount procfs in %s\n", proc_mountpoint); - if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) { - pr_perror("mount failed"); - rmdir(proc_mountpoint); - return -1; - } - - ret = fd = open_detach_mount(proc_mountpoint); + if (kdat.has_fsopen) + fd = ret = mount_detached_fs("proc"); + else + fd = ret = __legacy_mount_proc(); } if (fd >= 0) { @@ -1927,7 +1962,7 @@ static int catch_tasks(bool root_seized, enum trace_flags *flag) return 0; } -static int clear_breakpoints() +static int clear_breakpoints(void) { struct pstree_item *item; int ret = 0, i; @@ -1952,6 +1987,7 @@ static void finalize_restore(void) for_each_pstree_item(item) { pid_t pid = item->pid->real; struct parasite_ctl *ctl; + unsigned long restorer_addr; if (!task_alive(item)) continue; @@ -1961,7 +1997,9 @@ static void finalize_restore(void) if (ctl == NULL) continue; - compel_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer); + restorer_addr = (unsigned long)rsti(item)->munmap_restorer; + if (compel_unmap(ctl, restorer_addr)) + pr_err("Failed to unmap restorer from %d\n", pid); xfree(ctl); @@ -1971,7 +2009,7 @@ static void finalize_restore(void) } } -static void finalize_restore_detach(int status) +static int finalize_restore_detach(void) { struct pstree_item *item; @@ -1985,16 +2023,21 @@ static void finalize_restore_detach(int status) for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; if (pid < 0) { - BUG_ON(status >= 0); - break; + pr_err("pstree item has unvalid pid %d\n", pid); + continue; } - if (arch_set_thread_regs_nosigrt(&item->threads[i])) + if (arch_set_thread_regs_nosigrt(&item->threads[i])) { pr_perror("Restoring regs for %d failed", pid); - if (ptrace(PTRACE_DETACH, pid, NULL, 0)) - pr_perror("Unable to execute %d", pid); + return -1; + } + if (ptrace(PTRACE_DETACH, pid, NULL, 0)) { + 
pr_perror("Unable to detach %d", pid); + return -1; + } } } + return 0; } static void ignore_kids(void) @@ -2192,6 +2235,10 @@ static int restore_root_task(struct pstree_item *init) if (ret < 0) goto out_kill; + ret = apply_memfd_seals(); + if (ret < 0) + goto out_kill; + /* * Zombies die after CR_STATE_RESTORE which is switched * by root task, not by us. See comment before CR_STATE_FORKING @@ -2252,32 +2299,37 @@ static int restore_root_task(struct pstree_item *init) /* * ------------------------------------------------------------- - * Below this line nothing should fail, because network is unlocked + * Network is unlocked. If something fails below - we lose data + * or a connection. */ attach_to_tasks(root_seized); - ret = restore_switch_stage(CR_STATE_RESTORE_CREDS); - BUG_ON(ret); + if (restore_switch_stage(CR_STATE_RESTORE_CREDS)) + goto out_kill_network_unlocked; timing_stop(TIME_RESTORE); - ret = catch_tasks(root_seized, &flag); + if (catch_tasks(root_seized, &flag)) { + pr_err("Can't catch all tasks\n"); + goto out_kill_network_unlocked; + } if (lazy_pages_finish_restore()) - goto out_kill; + goto out_kill_network_unlocked; - pr_info("Restore finished successfully. Resuming tasks.\n"); __restore_switch_stage(CR_STATE_COMPLETE); - if (ret == 0) - ret = compel_stop_on_syscall(task_entries->nr_threads, - __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + ret = compel_stop_on_syscall(task_entries->nr_threads, + __NR(rt_sigreturn, 0), __NR(rt_sigreturn, 1), flag); + if (ret) { + pr_err("Can't stop all tasks on rt_sigreturn\n"); + goto out_kill_network_unlocked; + } if (clear_breakpoints()) pr_err("Unable to flush breakpoints\n"); - if (ret == 0) - finalize_restore(); + finalize_restore(); ret = run_scripts(ACT_PRE_RESUME); if (ret) @@ -2289,8 +2341,10 @@ static int restore_root_task(struct pstree_item *init) fini_cgroup(); /* Detaches from processes and they continue run through sigreturn. 
*/ - finalize_restore_detach(ret); + if (finalize_restore_detach()) + goto out_kill_network_unlocked; + pr_info("Restore finished successfully. Tasks resumed.\n"); write_stats(RESTORE_STATS); ret = run_scripts(ACT_POST_RESUME); @@ -2302,6 +2356,8 @@ static int restore_root_task(struct pstree_item *init) return 0; +out_kill_network_unlocked: + pr_err("Killing processes because of failure on restore.\nThe Network was unlocked so some data or a connection may have been lost.\n"); out_kill: /* * The processes can be killed only when all of them have been created, @@ -2423,6 +2479,11 @@ int cr_restore_tasks(void) goto err; ret = restore_root_task(root_item); + + if (opts.remote && (finish_remote_restore() < 0)) { + pr_err("Finish remote restore failed.\n"); + goto err; + } err: cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret); return ret; @@ -3096,7 +3157,7 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_profile = lsm_profile; - strncpy(args->lsm_profile, rendered, lsm_profile_len); + strlcpy(args->lsm_profile, rendered, lsm_profile_len + 1); xfree(rendered); } } else { @@ -3130,7 +3191,7 @@ rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos) args = rst_mem_remap_ptr(this_pos, RM_PRIVATE); args->lsm_sockcreate = lsm_sockcreate; - strncpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len); + strlcpy(args->lsm_sockcreate, rendered, lsm_sockcreate_len + 1); xfree(rendered); } } else { @@ -3326,10 +3387,13 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns vdso_maps_rt = vdso_maps; /* * Figure out how much memory runtime vdso and vvar will need. + * Check if vDSO or VVAR is not provided by kernel. 
*/ - vdso_rt_size = vdso_maps_rt.sym.vdso_size; - if (vdso_rt_size && vdso_maps_rt.sym.vvar_size) - vdso_rt_size += ALIGN(vdso_maps_rt.sym.vvar_size, PAGE_SIZE); + if (vdso_maps_rt.sym.vdso_size != VDSO_BAD_SIZE) { + vdso_rt_size = vdso_maps_rt.sym.vdso_size; + if (vdso_maps_rt.sym.vvar_size != VVAR_BAD_SIZE) + vdso_rt_size += vdso_maps_rt.sym.vvar_size; + } task_args->bootstrap_len += vdso_rt_size; /* @@ -3557,6 +3621,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_rt_size = vdso_rt_size; task_args->can_map_vdso = kdat.can_map_vdso; + task_args->has_clone3_set_tid = kdat.has_clone3_set_tid; new_sp = restorer_stack(task_args->t->mz); diff --git a/criu/cr-service.c b/criu/cr-service.c index 0938db02b0..279016bcd3 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -27,6 +27,7 @@ #include "cr-service.h" #include "cr-service-const.h" #include "page-xfer.h" +#include "protobuf.h" #include "net.h" #include "mount.h" #include "filesystems.h" @@ -49,18 +50,21 @@ unsigned int service_sk_ino = -1; static int recv_criu_msg(int socket_fd, CriuReq **req) { - unsigned char *buf; - int len; + u8 local[PB_PKOBJ_LOCAL_SIZE]; + void *buf = (void *)&local; + int len, exit_code = -1; len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK); if (len == -1) { pr_perror("Can't read request"); - return -1; + goto err; } - buf = xmalloc(len); - if (!buf) - return -ENOMEM; + if (len > sizeof(local)) { + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + } len = recv(socket_fd, buf, len, MSG_TRUNC); if (len == -1) { @@ -80,43 +84,47 @@ static int recv_criu_msg(int socket_fd, CriuReq **req) goto err; } - xfree(buf); - return 0; + exit_code = 0; err: - xfree(buf); - return -1; + if (buf != (void *)&local) + xfree(buf); + return exit_code; } static int send_criu_msg_with_fd(int socket_fd, CriuResp *msg, int fd) { - unsigned char *buf; - int len, ret; + u8 local[PB_PKOBJ_LOCAL_SIZE]; + void *buf = 
(void *)&local; + int len, exit_code = -1; len = criu_resp__get_packed_size(msg); - buf = xmalloc(len); - if (!buf) - return -ENOMEM; + if (len > sizeof(local)) { + buf = xmalloc(len); + if (!buf) + return -ENOMEM; + } if (criu_resp__pack(msg, buf) != len) { pr_perror("Failed packing response"); goto err; } - if (fd >= 0) { - ret = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); - } else - ret = write(socket_fd, buf, len); - if (ret < 0) { + if (fd >= 0) + exit_code = send_fds(socket_fd, NULL, 0, &fd, 1, buf, len); + else + exit_code = write(socket_fd, buf, len); + + if (exit_code < 0) { pr_perror("Can't send response"); goto err; } - xfree(buf); - return 0; + exit_code = 0; err: - xfree(buf); - return -1; + if (buf != (void *)&local) + xfree(buf); + return exit_code; } static int send_criu_msg(int socket_fd, CriuResp *msg) @@ -473,6 +481,19 @@ static int setup_opts_from_req(int sk, CriuOpts *req) opts.lazy_pages = req->lazy_pages; } + if (req->has_pre_dump_mode) { + switch (req->pre_dump_mode) { + case CRIU_PRE_DUMP_MODE__SPLICE: + opts.pre_dump_mode = PRE_DUMP_SPLICE; + break; + case CRIU_PRE_DUMP_MODE__VM_READ: + opts.pre_dump_mode = PRE_DUMP_READ; + break; + default: + goto err; + } + } + if (req->ps) { opts.port = (short)req->ps->port; @@ -608,6 +629,9 @@ static int setup_opts_from_req(int sk, CriuOpts *req) goto err; } + if (req->cgroup_yard) + SET_CHAR_OPTS(cgroup_yard, req->cgroup_yard); + if (req->tls_cacert) SET_CHAR_OPTS(tls_cacert, req->tls_cacert); if (req->tls_cacrl) @@ -1254,7 +1278,7 @@ static void reap_worker(int signo) } } -static int setup_sigchld_handler() +static int setup_sigchld_handler(void) { struct sigaction action; @@ -1271,7 +1295,7 @@ static int setup_sigchld_handler() return 0; } -static int restore_sigchld_handler() +static int restore_sigchld_handler(void) { struct sigaction action; diff --git a/criu/crtools.c b/criu/crtools.c index a94875684e..a9910a7c01 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -46,6 +46,14 @@ #include 
"setproctitle.h" #include "sysctl.h" +#include "img-remote.h" + +void flush_early_log_to_stderr(void) __attribute__((destructor)); + +void flush_early_log_to_stderr(void) +{ + flush_early_log_buffer(STDERR_FILENO); +} int main(int argc, char *argv[], char *envp[]) { @@ -95,10 +103,8 @@ int main(int argc, char *argv[], char *envp[]) return cr_service_work(atoi(argv[2])); } - if (check_options()) { - flush_early_log_buffer(STDERR_FILENO); + if (check_options()) return 1; - } if (opts.imgs_dir == NULL) SET_CHAR_OPTS(imgs_dir, "."); @@ -107,7 +113,7 @@ int main(int argc, char *argv[], char *envp[]) SET_CHAR_OPTS(work_dir, opts.imgs_dir); if (optind >= argc) { - pr_msg("Error: command is required\n"); + pr_err("command is required\n"); goto usage; } @@ -115,17 +121,17 @@ int main(int argc, char *argv[], char *envp[]) if (has_exec_cmd) { if (!has_sub_command) { - pr_msg("Error: --exec-cmd requires a command\n"); + pr_err("--exec-cmd requires a command\n"); goto usage; } if (strcmp(argv[optind], "restore")) { - pr_msg("Error: --exec-cmd is available for the restore command only\n"); + pr_err("--exec-cmd is available for the restore command only\n"); goto usage; } if (opts.restore_detach) { - pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n"); + pr_err("--restore-detached and --exec-cmd cannot be used together\n"); goto usage; } @@ -137,7 +143,7 @@ int main(int argc, char *argv[], char *envp[]) } else { /* No subcommands except for cpuinfo and restore --exec-cmd */ if (strcmp(argv[optind], "cpuinfo") && has_sub_command) { - pr_msg("Error: excessive parameter%s for command %s\n", + pr_err("excessive parameter%s for command %s\n", (argc - optind) > 2 ? 
"s" : "", argv[optind]); goto usage; } @@ -171,6 +177,9 @@ int main(int argc, char *argv[], char *envp[]) if (kerndat_init()) return 1; + if (fault_injected(FI_CANNOT_MAP_VDSO)) + kdat.can_map_vdso = 0; + if (opts.deprecated_ok) pr_debug("DEPRECATED ON\n"); @@ -228,6 +237,22 @@ int main(int argc, char *argv[], char *envp[]) if (!strcmp(argv[optind], "page-server")) return cr_page_server(opts.daemon_mode, false, -1) != 0; + if (!strcmp(argv[optind], "image-cache")) { + if (!opts.port) + goto opt_port_missing; + return image_cache(opts.daemon_mode, DEFAULT_CACHE_SOCKET); + } + + if (!strcmp(argv[optind], "image-proxy")) { + if (!opts.addr) { + pr_err("address not specified\n"); + return 1; + } + if (!opts.port) + goto opt_port_missing; + return image_proxy(opts.daemon_mode, DEFAULT_PROXY_SOCKET); + } + if (!strcmp(argv[optind], "service")) return cr_service(opts.daemon_mode); @@ -236,7 +261,7 @@ int main(int argc, char *argv[], char *envp[]) if (!strcmp(argv[optind], "cpuinfo")) { if (!argv[optind + 1]) { - pr_msg("Error: cpuinfo requires an action: dump or check\n"); + pr_err("cpuinfo requires an action: dump or check\n"); goto usage; } if (!strcmp(argv[optind + 1], "dump")) @@ -246,17 +271,17 @@ int main(int argc, char *argv[], char *envp[]) } if (!strcmp(argv[optind], "exec")) { - pr_msg("The \"exec\" action is deprecated by the Compel library.\n"); + pr_err("The \"exec\" action is deprecated by the Compel library.\n"); return -1; } if (!strcmp(argv[optind], "show")) { - pr_msg("The \"show\" action is deprecated by the CRIT utility.\n"); - pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n"); + pr_err("The \"show\" action is deprecated by the CRIT utility.\n"); + pr_err("To view an image use the \"crit decode -i $name --pretty\" command.\n"); return -1; } - pr_msg("Error: unknown command: %s\n", argv[optind]); + pr_err("unknown command: %s\n", argv[optind]); usage: pr_msg("\n" "Usage:\n" @@ -267,6 +292,8 @@ int main(int argc, char 
*argv[], char *envp[]) " criu service []\n" " criu dedup\n" " criu lazy-pages -D DIR []\n" +" criu image-cache []\n" +" criu image-proxy []\n" "\n" "Commands:\n" " dump checkpoint a process/tree identified by pid\n" @@ -278,6 +305,8 @@ int main(int argc, char *argv[], char *envp[]) " dedup remove duplicates in memory dump\n" " cpuinfo dump writes cpu information into image file\n" " cpuinfo check validates cpu information read from image file\n" +" image-proxy launch dump-side proxy to sent images\n" +" image-cache launch restore-side cache to receive images\n" ); if (usage_error) { @@ -330,6 +359,8 @@ int main(int argc, char *argv[], char *envp[]) " macvlan[IFNAME]:OUTNAME\n" " mnt[COOKIE]:ROOT\n" "\n" +" --remote dump/restore images directly to/from remote node using\n" +" image-proxy/image-cache\n" "* Special resources support:\n" " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n" " --" SK_INFLIGHT_PARAM " skip (ignore) in-flight TCP connections\n" @@ -366,6 +397,10 @@ int main(int argc, char *argv[], char *envp[]) " --cgroup-dump-controller NAME\n" " define cgroup controller to be dumped\n" " and skip anything else present in system\n" +" --cgroup-yard PATH\n" +" instead of trying to mount cgroups in CRIU, provide\n" +" a path to a directory with already created cgroup yard.\n" +" Useful if you don't want to grant CAP_SYS_ADMIN to CRIU\n" " --lsm-profile TYPE:NAME\n" " Specify an LSM profile to be used during restore.\n" " The type can be either 'apparmor' or 'selinux'.\n" @@ -380,6 +415,7 @@ int main(int argc, char *argv[], char *envp[]) " pipe[inode]\n" " socket[inode]\n" " file[mnt_id:inode]\n" +" /memfd:name\n" " path/to/file\n" " --empty-ns net Create a namespace, but don't restore its properties\n" " (assuming it will be restored by action scripts)\n" @@ -419,6 +455,8 @@ int main(int argc, char *argv[], char *envp[]) " pages images of previous dump\n" " when used on restore, as soon as page is restored, it\n" " will be punched from the 
image\n" +" --pre-dump-mode splice - parasite based pre-dumping (default)\n" +" read - process_vm_readv syscall based pre-dumping\n" "\n" "Page/Service server options:\n" " --address ADDR address of server or service\n" @@ -445,7 +483,11 @@ int main(int argc, char *argv[], char *envp[]) return 0; +opt_port_missing: + pr_err("port not specified\n"); + return 1; + opt_pid_missing: - pr_msg("Error: pid not specified\n"); + pr_err("pid not specified\n"); return 1; } diff --git a/criu/fault-injection.c b/criu/fault-injection.c index 4128814d52..4b06500083 100644 --- a/criu/fault-injection.c +++ b/criu/fault-injection.c @@ -3,7 +3,7 @@ enum faults fi_strategy; -int fault_injection_init() +int fault_injection_init(void) { char *val; int start; diff --git a/criu/files-reg.c b/criu/files-reg.c index 2f68bc03fe..1d24cc526e 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -33,8 +33,10 @@ #include "namespaces.h" #include "proc_parse.h" #include "pstree.h" +#include "string.h" #include "fault-injection.h" #include "external.h" +#include "memfd.h" #include "protobuf.h" #include "util.h" @@ -155,11 +157,32 @@ static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) char *buf = NULL; int ret; - while (len > 0) { - ret = sendfile(img, fd, &off, len); - if (ret <= 0) { - pr_perror("Can't send ghost to image"); + if (opts.remote) { + buf = xmalloc(BUFSIZE); + if (!buf) return -1; + } + + while (len > 0) { + if (opts.remote) { + ret = pread(fd, buf, min_t(size_t, BUFSIZE, len), off); + if (ret <= 0) { + pr_perror("Can't read from ghost file"); + xfree(buf); + return -1; + } + if (write(img, buf, ret) != ret) { + pr_perror("Can't write to image"); + xfree(buf); + return -1; + } + off += ret; + } else { + ret = sendfile(img, fd, &off, len); + if (ret <= 0) { + pr_perror("Can't send ghost to image"); + return -1; + } } len -= ret; @@ -214,15 +237,35 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) char *buf = NULL; int ret; - while (len > 0) 
{ - if (lseek(fd, off, SEEK_SET) < 0) { - pr_perror("Can't seek file"); - return -1; - } - ret = sendfile(fd, img, NULL, len); - if (ret < 0) { - pr_perror("Can't send data"); + if (opts.remote) { + buf = xmalloc(BUFSIZE); + if (!buf) return -1; + } + + while (len > 0) { + if (opts.remote) { + ret = read(img, buf, min_t(size_t, BUFSIZE, len)); + if (ret <= 0) { + pr_perror("Can't read from image"); + xfree(buf); + return -1; + } + if (pwrite(fd, buf, ret, off) != ret) { + pr_perror("Can't write to file"); + xfree(buf); + return -1; + } + } else { + if (lseek(fd, off, SEEK_SET) < 0) { + pr_perror("Can't seek file"); + return -1; + } + ret = sendfile(fd, img, NULL, len); + if (ret < 0) { + pr_perror("Can't send data"); + return -1; + } } off += ret; @@ -280,19 +323,53 @@ static int mkreg_ghost(char *path, GhostFileEntry *gfe, struct cr_img *img) return ret; } +static int mklnk_ghost(char *path, GhostFileEntry *gfe) +{ + if (!gfe->symlnk_target) { + pr_err("Ghost symlink target is NULL for %s. Image from old CRIU?\n", path); + return -1; + } + + if (symlink(gfe->symlnk_target, path) < 0) { + /* + * ENOENT case is OK + * Take a look closer on create_ghost() function + */ + if (errno != ENOENT) + pr_perror("symlink(%s, %s) failed", gfe->symlnk_target, path); + return -1; + } + + return 0; +} + static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe) { struct timeval tv[2]; int ret = -1; - if (chown(path, gfe->uid, gfe->gid) < 0) { - pr_perror("Can't reset user/group on ghost %s", path); - goto err; - } + if (S_ISLNK(gfe->mode)) { + if (lchown(path, gfe->uid, gfe->gid) < 0) { + pr_perror("Can't reset user/group on ghost %s", path); + goto err; + } - if (chmod(path, gfe->mode)) { - pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); - goto err; + /* + * We have no lchmod() function, and fchmod() will fail on + * O_PATH | O_NOFOLLOW fd. 
Yes, we have fchmodat() + * function and flag AT_SYMLINK_NOFOLLOW described in + * man 2 fchmodat, but it is not currently implemented. %) + */ + } else { + if (chown(path, gfe->uid, gfe->gid) < 0) { + pr_perror("Can't reset user/group on ghost %s", path); + goto err; + } + + if (chmod(path, gfe->mode)) { + pr_perror("Can't set perms %o on ghost %s", gfe->mode, path); + goto err; + } } if (gfe->atim) { @@ -351,6 +428,9 @@ static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_im } else if (S_ISDIR(gfe->mode)) { if ((ret = mkdirpat(AT_FDCWD, path, gfe->mode)) < 0) msg = "Can't make ghost dir"; + } else if (S_ISLNK(gfe->mode)) { + if ((ret = mklnk_ghost(path, gfe)) < 0) + msg = "Can't create ghost symlink"; } else { if ((ret = mkreg_ghost(path, gfe, img)) < 0) msg = "Can't create ghost regfile"; @@ -456,7 +536,7 @@ static int open_remap_ghost(struct reg_file_info *rfi, gf->remap.rmnt_id = rfi->rfe->mnt_id; if (S_ISDIR(gfe->mode)) - strncpy(gf->remap.rpath, rfi->path, PATH_MAX); + strlcpy(gf->remap.rpath, rfi->path, PATH_MAX); else ghost_path(gf->remap.rpath, PATH_MAX, rfi, rpe); @@ -738,6 +818,7 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de int exit_code = -1; GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT; Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT; + char pathbuf[PATH_MAX]; pr_info("Dumping ghost file contents (id %#x)\n", id); @@ -771,19 +852,47 @@ static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_de gfe.size = st->st_size; } + /* + * We set gfe.symlnk_target only if we need to dump + * symlink content, otherwise we leave it NULL. + * It will be taken into account on restore in mklnk_ghost function. + */ + if (S_ISLNK(st->st_mode)) { + ssize_t ret; + + /* + * We assume that _fd opened with O_PATH | O_NOFOLLOW + * flags because S_ISLNK(st->st_mode). With current kernel version, + * it's looks like correct assumption in any case. 
+ */ + ret = readlinkat(_fd, "", pathbuf, sizeof(pathbuf) - 1); + if (ret < 0) { + pr_perror("Can't readlinkat"); + goto err_out; + } + + pathbuf[ret] = 0; + + if (ret != st->st_size) { + pr_err("Buffer for readlinkat is too small: ret %zd, st_size %"PRId64", buf %u %s\n", + ret, st->st_size, PATH_MAX, pathbuf); + goto err_out; + } + + gfe.symlnk_target = pathbuf; + } + if (pb_write_one(img, &gfe, PB_GHOST_FILE)) goto err_out; if (S_ISREG(st->st_mode)) { int fd, ret; - char lpath[PSFDS]; /* * Reopen file locally since it may have no read * permissions when drained */ - sprintf(lpath, "/proc/self/fd/%d", _fd); - fd = open(lpath, O_RDONLY); + fd = open_proc(PROC_SELF, "fd/%d", _fd); if (fd < 0) { pr_perror("Can't open ghost original file"); goto err_out; @@ -844,10 +953,13 @@ static int dump_ghost_remap(char *path, const struct stat *st, gf->dev = phys_dev; gf->ino = st->st_ino; gf->id = ghost_file_ids++; - list_add_tail(&gf->list, &ghost_files); - if (dump_ghost_file(lfd, gf->id, st, phys_dev)) + if (dump_ghost_file(lfd, gf->id, st, phys_dev)) { + xfree(gf); return -1; + } + + list_add_tail(&gf->list, &ghost_files); dump_entry: rpe.orig_id = id; @@ -1113,6 +1225,7 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, int ret, mntns_root; struct stat pst; const struct stat *ost = &parms->stat; + int flags = 0; if (parms->fs_type == PROC_SUPER_MAGIC) { /* The file points to /proc/pid/ where pid is a dead @@ -1209,7 +1322,10 @@ static int check_path_remap(struct fd_link *link, const struct fd_parms *parms, if (mntns_root < 0) return -1; - ret = fstatat(mntns_root, rpath, &pst, 0); + if (S_ISLNK(parms->stat.st_mode)) + flags = AT_SYMLINK_NOFOLLOW; + + ret = fstatat(mntns_root, rpath, &pst, flags); if (ret < 0) { /* * Linked file, but path is not accessible (unless any @@ -1773,11 +1889,17 @@ static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg) if (fd < 0) return fd; - if ((rfi->rfe->pos != -1ULL) && - lseek(fd, 
rfi->rfe->pos, SEEK_SET) < 0) { - pr_perror("Can't restore file pos"); - close(fd); - return -1; + /* + * O_PATH opened files carry empty fops in kernel, + * just ignore positioning at all. + */ + if (!(rfi->rfe->flags & O_PATH)) { + if (rfi->rfe->pos != -1ULL && + lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) { + pr_perror("Can't restore file pos"); + close(fd); + return -1; + } } return fd; @@ -1876,7 +1998,10 @@ static int open_filemap(int pid, struct vma_area *vma) flags = vma->e->fdflags; if (ctx.flags != flags || ctx.desc != vma->vmfd) { - ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); + if (vma->e->status & VMA_AREA_MEMFD) + ret = memfd_open(vma->vmfd, &flags); + else + ret = open_path(vma->vmfd, do_open_reg_noseek_flags, &flags); if (ret < 0) return ret; @@ -1906,7 +2031,10 @@ int collect_filemap(struct vma_area *vma) vma->e->fdflags = O_RDONLY; } - fd = collect_special_file(vma->e->shmid); + if (vma->e->status & VMA_AREA_MEMFD) + fd = collect_memfd(vma->e->shmid); + else + fd = collect_special_file(vma->e->shmid); if (!fd) return -1; diff --git a/criu/files.c b/criu/files.c index ffdaa459fc..a1fd267642 100644 --- a/criu/files.c +++ b/criu/files.c @@ -34,6 +34,7 @@ #include "sk-packet.h" #include "mount.h" #include "signalfd.h" +#include "memfd.h" #include "namespaces.h" #include "tun.h" #include "timerfd.h" @@ -44,6 +45,7 @@ #include "autofs.h" #include "parasite.h" #include "parasite-syscall.h" +#include "string.h" #include "kerndat.h" #include "fdstore.h" @@ -290,8 +292,7 @@ static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link) char buf[PATH_MAX]; int n; - strncpy(buf, link->name, PATH_MAX); - buf[PATH_MAX - 1] = 0; + strlcpy(buf, link->name, PATH_MAX); n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2); if (n >= PATH_MAX) { pr_err("Not enough space to replace %s\n", buf); @@ -382,7 +383,13 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, p->fs_type = fsbuf.f_type; p->fd = fd; p->pos = 
fdinfo.pos; - p->flags = fdinfo.flags; + /* + * The kernel artificially adds the O_CLOEXEC flag on the file pointer + * flags by looking at the flags on the file descriptor (see kernel + * code fs/proc/fd.c). FD_CLOEXEC is a file descriptor property, which + * is saved in fd_flags. + */ + p->flags = fdinfo.flags & ~O_CLOEXEC; p->mnt_id = fdinfo.mnt_id; p->pid = owner_pid->real; p->fd_flags = opts->flags; @@ -392,7 +399,10 @@ static int fill_fd_params(struct pid *owner_pid, int fd, int lfd, pr_info("%d fdinfo %d: pos: %#16"PRIx64" flags: %16o/%#x\n", owner_pid->real, fd, p->pos, p->flags, (int)p->fd_flags); - ret = fcntl(lfd, F_GETSIG, 0); + if (p->flags & O_PATH) + ret = 0; + else + ret = fcntl(lfd, F_GETSIG, 0); if (ret < 0) { pr_perror("Can't get owner signum on %d", lfd); return -1; @@ -535,18 +545,23 @@ static int dump_one_file(struct pid *pid, int fd, int lfd, struct fd_opts *opts, return do_dump_gen_file(&p, lfd, ops, e); } - if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) { + if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode) || + S_ISLNK(p.stat.st_mode)) { if (fill_fdlink(lfd, &p, &link)) return -1; p.link = &link; - if (link.name[1] == '/') - return do_dump_gen_file(&p, lfd, ®file_dump_ops, e); - if (check_ns_proc(&link)) - return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, e); + if (is_memfd(p.stat.st_dev)) + ops = &memfd_dump_ops; + else if (link.name[1] == '/') + ops = ®file_dump_ops; + else if (check_ns_proc(&link)) + ops = &nsfile_dump_ops; + else + return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); - return dump_unsupp_fd(&p, lfd, "reg", link.name + 1, e); + return do_dump_gen_file(&p, lfd, ops, e); } if (S_ISFIFO(p.stat.st_mode)) { @@ -1597,7 +1612,7 @@ int inherit_fd_lookup_id(char *id) bool inherited_fd(struct file_desc *d, int *fd_p) { - char buf[32], *id_str; + char buf[PATH_MAX], *id_str; int i_fd; if (!d->ops->name) @@ -1715,6 +1730,9 @@ static int collect_one_file(void *o, ProtobufCMessage *base, struct cr_img *i) case 
FD_TYPES__TTY: ret = collect_one_file_entry(fe, fe->tty->id, &fe->tty->base, &tty_cinfo); break; + case FD_TYPES__MEMFD: + ret = collect_one_file_entry(fe, fe->memfd->id, &fe->memfd->base, &memfd_cinfo); + break; } return ret; diff --git a/criu/filesystems.c b/criu/filesystems.c index 1e4550b371..d76b182918 100644 --- a/criu/filesystems.c +++ b/criu/filesystems.c @@ -747,6 +747,11 @@ static struct fstype fstypes[] = { .code = FSTYPE__CGROUP, .parse = cgroup_parse, .sb_equal = cgroup_sb_equal, + }, { + .name = "cgroup2", + .code = FSTYPE__CGROUP2, + .parse = cgroup_parse, + .sb_equal = cgroup_sb_equal, }, { .name = "aufs", .code = FSTYPE__AUFS, diff --git a/criu/image-desc.c b/criu/image-desc.c index 053e7af219..b538a76ea5 100644 --- a/criu/image-desc.c +++ b/criu/image-desc.c @@ -66,6 +66,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY(FS, "fs-%u"), FD_ENTRY(REMAP_FPATH, "remap-fpath"), FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF), + FD_ENTRY_F(MEMFD_INODE, "memfd-%u", O_NOBUF), FD_ENTRY(TCP_STREAM, "tcp-stream-%x"), FD_ENTRY(MNTS, "mountpoints-%u"), FD_ENTRY(NETDEV, "netdev-%u"), @@ -76,6 +77,7 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { FD_ENTRY_F(RULE, "rule-%u", O_NOBUF), FD_ENTRY_F(IPTABLES, "iptables-%u", O_NOBUF), FD_ENTRY_F(IP6TABLES, "ip6tables-%u", O_NOBUF), + FD_ENTRY_F(NFTABLES, "nftables-%u", O_NOBUF), FD_ENTRY_F(TMPFS_IMG, "tmpfs-%u.tar.gz", O_NOBUF), FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%u.tar.gz", O_NOBUF), FD_ENTRY_F(AUTOFS, "autofs-%u", O_NOBUF), @@ -112,9 +114,4 @@ struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = { .magic = IRMAP_CACHE_MAGIC, .oflags = O_SERVICE | O_FORCE_LOCAL, }, - - [CR_FD_FILE_LOCKS_PID] = { - .fmt = "filelocks-%u.img", - .magic = FILE_LOCKS_MAGIC, - }, }; diff --git a/criu/image.c b/criu/image.c index 2eb9269296..1a484f192a 100644 --- a/criu/image.c +++ b/criu/image.c @@ -17,6 +17,7 @@ #include "images/inventory.pb-c.h" #include "images/pagemap.pb-c.h" #include "proc_parse.h" +#include 
"img-remote.h" #include "namespaces.h" bool ns_per_id = false; @@ -190,7 +191,7 @@ int prepare_inventory(InventoryEntry *he) struct dmp_info d; } crt = { .i.pid = &pid }; - pr_info("Perparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); + pr_info("Preparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1); he->img_version = CRTOOLS_IMAGES_V1_1; he->fdinfo_per_id = true; @@ -390,6 +391,50 @@ static int img_write_magic(struct cr_img *img, int oflags, int type) return write_img(img, &imgset_template[type].magic); } +int do_open_remote_image(int dfd, char *path, int flags) +{ + char *snapshot_id = NULL; + int ret, save; + + /* When using namespaces, the current dir is changed so we need to + * change to previous working dir and back to correctly open the image + * proxy and cache sockets. */ + save = open(".", O_RDONLY); + if (save < 0) { + pr_perror("unable to open current working directory"); + return -1; + } + + if (fchdir(get_service_fd(IMG_FD_OFF)) < 0) { + pr_perror("fchdir to dfd failed!\n"); + close(save); + return -1; + } + + snapshot_id = get_snapshot_id_from_idx(dfd); + + if (snapshot_id == NULL) + ret = -1; + else if (flags == O_RDONLY) { + pr_debug("do_open_remote_image RDONLY path=%s snapshot_id=%s\n", + path, snapshot_id); + ret = read_remote_image_connection(snapshot_id, path); + } else { + pr_debug("do_open_remote_image WRONLY path=%s snapshot_id=%s\n", + path, snapshot_id); + ret = write_remote_image_connection(snapshot_id, path, O_WRONLY); + } + + if (fchdir(save) < 0) { + pr_perror("fchdir to save failed"); + close(save); + return -1; + } + close(save); + + return ret; +} + struct openat_args { char path[PATH_MAX]; int flags; @@ -415,24 +460,28 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL); - /* - * For pages images dedup we need to open images read-write on - * restore, that may require proper capabilities, so we ask - * usernsd to do it for us 
- */ - if (root_ns_mask & CLONE_NEWUSER && - type == CR_FD_PAGES && oflags & O_RDWR) { - struct openat_args pa = { - .flags = flags, - .err = 0, - .mode = CR_FD_PERM, - }; - snprintf(pa.path, PATH_MAX, "%s", path); - ret = userns_call(userns_openat, UNS_FDOUT, &pa, sizeof(struct openat_args), dfd); - if (ret < 0) - errno = pa.err; - } else - ret = openat(dfd, path, flags, CR_FD_PERM); + if (opts.remote && !(oflags & O_FORCE_LOCAL)) + ret = do_open_remote_image(dfd, path, flags); + else { + /* + * For pages images dedup we need to open images read-write on + * restore, that may require proper capabilities, so we ask + * usernsd to do it for us + */ + if (root_ns_mask & CLONE_NEWUSER && + type == CR_FD_PAGES && oflags & O_RDWR) { + struct openat_args pa = { + .flags = flags, + .err = 0, + .mode = CR_FD_PERM, + }; + snprintf(pa.path, PATH_MAX, "%s", path); + ret = userns_call(userns_openat, UNS_FDOUT, &pa, sizeof(struct openat_args), dfd); + if (ret < 0) + errno = pa.err; + } else + ret = openat(dfd, path, flags, CR_FD_PERM); + } if (ret < 0) { if (!(flags & O_CREAT) && (errno == ENOENT || ret == -ENOENT)) { pr_info("No %s image\n", path); @@ -535,7 +584,9 @@ int open_image_dir(char *dir) return -1; fd = ret; - if (opts.img_parent) { + if (opts.remote) { + init_snapshot_id(dir); + } else if (opts.img_parent) { ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); if (ret < 0 && errno != EEXIST) { pr_perror("Can't link parent snapshot"); diff --git a/criu/img-cache.c b/criu/img-cache.c new file mode 100644 index 0000000000..3887b500d2 --- /dev/null +++ b/criu/img-cache.c @@ -0,0 +1,56 @@ +#include +#include + +#include "cr_options.h" +#include "img-remote.h" +#include "util.h" + +int image_cache(bool background, char *local_cache_path) +{ + int tmp; + + pr_info("Proxy to Cache Port %u, CRIU to Cache Path %s\n", + opts.port, local_cache_path); + restoring = true; + + if (opts.ps_socket != -1) { + remote_sk = opts.ps_socket; + pr_info("Re-using ps socket %d\n", 
remote_sk); + } else { + remote_sk = setup_tcp_server("image cache"); + if (remote_sk < 0) { + pr_perror("Unable to open proxy to cache TCP socket"); + return -1; + } + // Wait to accept connection from proxy. + tmp = accept(remote_sk, NULL, 0); + if (tmp < 0) { + pr_perror("Unable to accept remote image connection" + " from image proxy"); + close(remote_sk); + return -1; + } + remote_sk = tmp; + } + + pr_info("Cache is connected to Proxy through fd %d\n", remote_sk); + + local_sk = setup_UNIX_server_socket(local_cache_path); + if (local_sk < 0) { + pr_perror("Unable to open cache to proxy UNIX socket"); + close(remote_sk); + return -1; + + } + + if (background) { + if (daemon(1, 0) == -1) { + pr_perror("Can't run service server in the background"); + return -1; + } + } + + accept_image_connections(); + pr_info("Finished image cache."); + return 0; +} diff --git a/criu/img-proxy.c b/criu/img-proxy.c new file mode 100644 index 0000000000..f15bd7c9a0 --- /dev/null +++ b/criu/img-proxy.c @@ -0,0 +1,45 @@ +#include + +#include "cr_options.h" +#include "criu-log.h" +#include "img-remote.h" +#include "util.h" + +int image_proxy(bool background, char *local_proxy_path) +{ + pr_info("CRIU to Proxy Path: %s, Cache Address %s:%u\n", + local_proxy_path, opts.addr, opts.port); + restoring = false; + + local_sk = setup_UNIX_server_socket(local_proxy_path); + if (local_sk < 0) { + pr_perror("Unable to open CRIU to proxy UNIX socket"); + return -1; + } + + if (opts.ps_socket != -1) { + remote_sk = opts.ps_socket; + pr_info("Re-using ps socket %d\n", remote_sk); + } else { + remote_sk = setup_tcp_client(); + if (remote_sk < 0) { + pr_perror("Unable to open proxy to cache TCP socket"); + close(local_sk); + return -1; + } + } + + pr_info("Proxy is connected to Cache through fd %d\n", remote_sk); + + if (background) { + if (daemon(1, 0) == -1) { + pr_perror("Can't run service server in the background"); + return -1; + } + } + + // TODO - local_sk and remote_sk send as args. 
+ accept_image_connections(); + pr_info("Finished image proxy."); + return 0; +} diff --git a/criu/img-remote.c b/criu/img-remote.c new file mode 100644 index 0000000000..f9464e011d --- /dev/null +++ b/criu/img-remote.c @@ -0,0 +1,1164 @@ +#include +#include +#include +#include +#include + +#include "cr_options.h" +#include "img-remote.h" +#include "image.h" +#include "images/remote-image.pb-c.h" +#include "protobuf.h" +#include "servicefd.h" +#include "string.h" +#include "xmalloc.h" + +#define EPOLL_MAX_EVENTS 50 + +#define strflags(f) ((f) == O_RDONLY ? "read" : \ + (f) == O_APPEND ? "append" : "write") + +// List of images already in memory. +static LIST_HEAD(rimg_head); + +// List of local operations currently in-progress. +static LIST_HEAD(rop_inprogress); + +// List of local operations pending (reads on the restore side for images that +// still haven't arrived). +static LIST_HEAD(rop_pending); + +// List of images waiting to be forwarded. The head of the list is currently +// being forwarded. +static LIST_HEAD(rop_forwarding); + +// List of snapshots (useful when doing incremental restores/dumps) +static LIST_HEAD(snapshot_head); + +// Snapshot id (setup at launch time by dump or restore). +static char *snapshot_id; + +// True if restoring (cache := true; proxy := false). +bool restoring = true; + +// True if the proxy to cache socket is being used (receiving or sending). +static bool forwarding = false; + +// True if the local dump or restore is finished. +static bool finished_local = false; + +// True if the communication between the proxy and cache can be closed. +static bool finished_remote = false; + +// Proxy to cache socket fd; Local dump or restore servicing fd. +int remote_sk; +int local_sk; + +// Epoll fd and event array. +static int epoll_fd; +static struct epoll_event *events; + +static int64_t recv_image_async(struct roperation *op); +static int64_t send_image_async(struct roperation *op); + +/* A snapshot is a dump or pre-dump operation. 
Each snapshot is identified by an + * ID which corresponds to the working directory specified by the user. + */ +struct snapshot { + char snapshot_id[PATH_MAX]; + struct list_head l; +}; + +static struct snapshot *new_snapshot(char *snapshot_id) +{ + struct snapshot *s = xmalloc(sizeof(struct snapshot)); + + if (!s) + return NULL; + + strncpy(s->snapshot_id, snapshot_id, PATH_MAX - 1); + s->snapshot_id[PATH_MAX - 1]= '\0'; + return s; +} + +static inline void add_snapshot(struct snapshot *snapshot) +{ + list_add_tail(&(snapshot->l), &snapshot_head); +} + +struct rimage *get_rimg_by_name(const char *snapshot_id, const char *path) +{ + struct rimage *rimg = NULL; + + list_for_each_entry(rimg, &rimg_head, l) { + if (!strncmp(rimg->path, path, PATH_MAX) && + !strncmp(rimg->snapshot_id, snapshot_id, PATH_MAX)) { + return rimg; + } + } + return NULL; +} + +static inline struct roperation *get_rop_by_name(struct list_head *head, + const char *snapshot_id, const char *path) +{ + struct roperation *rop = NULL; + + list_for_each_entry(rop, head, l) { + if (!strncmp(rop->path, path, PATH_MAX) && + !strncmp(rop->snapshot_id, snapshot_id, PATH_MAX)) { + return rop; + } + } + return NULL; +} + +static int event_set(int epoll_fd, int op, int fd, uint32_t events, void *data) +{ + int ret; + struct epoll_event event; + event.events = events; + event.data.ptr = data; + + ret = epoll_ctl(epoll_fd, op, fd, &event); + if (ret) + pr_perror("[fd=%d] Unable to set event", fd); + return ret; +} + +int setup_UNIX_server_socket(char *path) +{ + struct sockaddr_un addr; + int sockfd = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK, 0); + + if (sockfd < 0) { + pr_perror("Unable to open image socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path)-1); + + unlink(path); + + if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == -1) { + pr_perror("Unable to bind image socket"); + goto err; + } + + if 
(listen(sockfd, 50) == -1) { + pr_perror("Unable to listen image socket"); + goto err; + } + + return sockfd; +err: + close(sockfd); + return -1; +} + +static int setup_UNIX_client_socket(char *path) +{ + struct sockaddr_un addr; + int sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + + if (sockfd < 0) { + pr_perror("Unable to open local image socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + strncpy(addr.sun_path, path, sizeof(addr.sun_path)-1); + + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Unable to connect to local socket: %s", path); + close(sockfd); + return -1; + } + + return sockfd; +} + +static inline int64_t pb_write_obj(int fd, void *obj, int type) +{ + struct cr_img img; + + img._x.fd = fd; + bfd_setraw(&img._x); + return pb_write_one(&img, obj, type); +} + +static inline int64_t pb_read_obj(int fd, void **pobj, int type) +{ + struct cr_img img; + + img._x.fd = fd; + bfd_setraw(&img._x); + return do_pb_read_one(&img, pobj, type, true); +} + +static inline int64_t write_header(int fd, char *snapshot_id, char *path, + int flags) +{ + LocalImageEntry li = LOCAL_IMAGE_ENTRY__INIT; + + li.name = path; + li.snapshot_id = snapshot_id; + li.open_mode = flags; + return pb_write_obj(fd, &li, PB_LOCAL_IMAGE); +} + +static inline int64_t write_reply_header(int fd, int error) +{ + LocalImageReplyEntry lir = LOCAL_IMAGE_REPLY_ENTRY__INIT; + + lir.error = error; + return pb_write_obj(fd, &lir, PB_LOCAL_IMAGE_REPLY); +} + +static inline int64_t write_remote_header(int fd, char *snapshot_id, + char *path, int flags, uint64_t size) +{ + RemoteImageEntry ri = REMOTE_IMAGE_ENTRY__INIT; + + ri.name = path; + ri.snapshot_id = snapshot_id; + ri.open_mode = flags; + ri.size = size; + return pb_write_obj(fd, &ri, PB_REMOTE_IMAGE); +} + +static inline int64_t read_header(int fd, char *snapshot_id, char *path, + int *flags) +{ + LocalImageEntry *li; + int ret = pb_read_obj(fd, (void **)&li, 
PB_LOCAL_IMAGE);
+
+	if (ret > 0) {
+		strncpy(snapshot_id, li->snapshot_id, PATH_MAX - 1);
+		snapshot_id[PATH_MAX - 1] = 0;
+		strncpy(path, li->name, PATH_MAX - 1);
+		path[PATH_MAX - 1] = 0;
+		*flags = li->open_mode;
+	}
+	free(li);
+	return ret;
+}
+
+static inline int64_t read_reply_header(int fd, int *error)
+{
+	LocalImageReplyEntry *lir;
+	int ret = pb_read_obj(fd, (void **)&lir, PB_LOCAL_IMAGE_REPLY);
+
+	if (ret > 0)
+		*error = lir->error;
+	free(lir);
+	return ret;
+}
+
+/*
+ * Read one RemoteImageEntry header from fd and copy its fields out.
+ *
+ * snapshot_id and path must each point to at least PATH_MAX bytes; both are
+ * always NUL-terminated on success, even when the received string is
+ * truncated. The out-parameters are written only when the result is
+ * positive. Returns whatever pb_read_obj() returned.
+ */
+static inline int64_t read_remote_header(int fd, char *snapshot_id, char *path,
+		int *flags, uint64_t *size)
+{
+	RemoteImageEntry *ri;
+	int ret = pb_read_obj(fd, (void **)&ri, PB_REMOTE_IMAGE);
+
+	if (ret > 0) {
+		// strncpy() does not terminate on truncation, so terminate
+		// explicitly (same pattern as read_header()).
+		strncpy(snapshot_id, ri->snapshot_id, PATH_MAX - 1);
+		snapshot_id[PATH_MAX - 1] = 0;
+		strncpy(path, ri->name, PATH_MAX - 1);
+		path[PATH_MAX - 1] = 0;
+		*flags = ri->open_mode;
+		*size = ri->size;
+	}
+	free(ri);
+	return ret;
+}
+
+static struct rimage *new_remote_image(char *path, char *snapshot_id)
+{
+	struct rimage *rimg = xzalloc(sizeof(struct rimage));
+	struct rbuf *buf = xzalloc(sizeof(struct rbuf));
+
+	if (rimg == NULL || buf == NULL)
+		goto err;
+
+	strncpy(rimg->path, path, PATH_MAX -1 );
+	strncpy(rimg->snapshot_id, snapshot_id, PATH_MAX - 1);
+	rimg->path[PATH_MAX - 1] = '\0';
+	rimg->snapshot_id[PATH_MAX - 1] = '\0';
+	INIT_LIST_HEAD(&(rimg->buf_head));
+	list_add_tail(&(buf->l), &(rimg->buf_head));
+	rimg->curr_fwd_buf = buf;
+
+	return rimg;
+err:
+	xfree(rimg);
+	xfree(buf);
+	return NULL;
+}
+
+static struct roperation *new_remote_operation(char *path,
+	char *snapshot_id, int cli_fd, int flags, bool close_fd)
+{
+	struct roperation *rop = xzalloc(sizeof(struct roperation));
+
+	if (rop == NULL)
+		return NULL;
+
+	strncpy(rop->path, path, PATH_MAX -1 );
+	strncpy(rop->snapshot_id, snapshot_id, PATH_MAX - 1);
+	rop->path[PATH_MAX - 1] = '\0';
+	rop->snapshot_id[PATH_MAX - 1] = '\0';
+	rop->fd = cli_fd;
+	rop->flags = flags;
+	rop->close_fd = close_fd;
+
+	return rop;
+}
+
+static inline void rop_set_rimg(struct
roperation *rop, struct rimage *rimg) +{ + rop->rimg = rimg; + rop->size = rimg->size; + if (rop->flags == O_APPEND) { + // Image forward on append must start where the last fwd finished. + if (rop->fd == remote_sk) { + rop->curr_sent_buf = rimg->curr_fwd_buf; + rop->curr_sent_bytes = rimg->curr_fwd_bytes; + } else { + // For local appends, just write at the end. + rop->curr_sent_buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + rop->curr_sent_bytes = rop->curr_sent_buf->nbytes; + } + // On the receiver size, we just append + rop->curr_recv_buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + } else { + // Writes or reads are simple. Just do it from the beginning. + rop->curr_recv_buf = list_entry(rimg->buf_head.next, struct rbuf, l); + rop->curr_sent_buf = list_entry(rimg->buf_head.next, struct rbuf, l); + rop->curr_sent_bytes = 0; + } +} + +/* Clears a remote image struct for reusing it. */ +static inline struct rimage *clear_remote_image(struct rimage *rimg) +{ + while (!list_is_singular(&(rimg->buf_head))) { + struct rbuf *buf = list_entry(rimg->buf_head.prev, struct rbuf, l); + + list_del(rimg->buf_head.prev); + xfree(buf); + } + + list_entry(rimg->buf_head.next, struct rbuf, l)->nbytes = 0; + rimg->size = 0; + + return rimg; +} + +static struct roperation *handle_accept_write(int cli_fd, char *snapshot_id, + char *path, int flags, bool close_fd, uint64_t size) +{ + struct roperation *rop = NULL; + struct rimage *rimg = get_rimg_by_name(snapshot_id, path); + + if (rimg == NULL) { + rimg = new_remote_image(path, snapshot_id); + if (rimg == NULL) { + pr_perror("Error preparing remote image"); + goto err; + } + } else { + list_del(&(rimg->l)); + if (flags == O_APPEND) + clear_remote_image(rimg); + } + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, close_fd); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + goto err; + } + + rop_set_rimg(rop, rimg); + rop->size = size; + return rop; +err: + xfree(rimg); + 
xfree(rop); + return NULL; +} + +static inline struct roperation *handle_accept_proxy_write(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + return handle_accept_write(cli_fd, snapshot_id, path, flags, true, 0); +} + +static struct roperation *handle_accept_proxy_read(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + struct roperation *rop = NULL; + struct rimage *rimg = NULL; + + rimg = get_rimg_by_name(snapshot_id, path); + + // Check if we already have the image. + if (rimg == NULL) { + pr_info("No image %s:%s.\n", path, snapshot_id); + if (write_reply_header(cli_fd, ENOENT) < 0) { + pr_perror("Error writing reply header for unexisting image"); + goto err; + } + close(cli_fd); + return NULL; + } + + if (write_reply_header(cli_fd, 0) < 0) { + pr_perror("Error writing reply header for %s:%s", + path, snapshot_id); + goto err; + } + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, true); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + goto err; + } + + rop_set_rimg(rop, rimg); + return rop; +err: + close(cli_fd); + return NULL; +} + +static inline void finish_local(void) +{ + int ret; + finished_local = true; + ret = event_set(epoll_fd, EPOLL_CTL_DEL, local_sk, 0, 0); + if (ret) { + pr_perror("Failed to del local fd from epoll"); + } +} + +static struct roperation *handle_accept_cache_read(int cli_fd, + char *snapshot_id, char *path, int flags) +{ + struct rimage *rimg = NULL; + struct roperation *rop = NULL; + + rop = new_remote_operation(path, snapshot_id, cli_fd, flags, true); + if (rop == NULL) { + pr_perror("Error preparing remote operation"); + close(cli_fd); + return NULL; + } + + // Check if we already have the image. 
+	rimg = get_rimg_by_name(snapshot_id, path);
+	if (rimg != NULL && rimg->size > 0) {
+		if (write_reply_header(cli_fd, 0) < 0) {
+			pr_perror("Error writing reply header for %s:%s",
+					path, snapshot_id);
+			close(rop->fd);
+			xfree(rop);
+			// rop has just been freed: stop here instead of
+			// attaching an image to it and returning it.
+			return NULL;
+		}
+		rop_set_rimg(rop, rimg);
+		return rop;
+	} else if (finished_remote) {
+		// The file does not exist.
+		pr_info("No image %s:%s.\n", path, snapshot_id);
+		if (write_reply_header(cli_fd, ENOENT) < 0)
+			pr_perror("Error writing reply header for unexisting image");
+		close(cli_fd);
+		xfree(rop);
+		return NULL;
+	}
+
+	// The image has not arrived yet and more may still come: return the
+	// operation with no image attached so that the caller can queue it as
+	// pending rather than leaking it together with the client socket.
+	return rop;
+}
+
+static void forward_remote_image(struct roperation *rop)
+{
+	int64_t ret = 0;
+
+	// Set blocking during the setup.
+	fd_set_nonblocking(rop->fd, false);
+
+	ret = write_remote_header(
+		rop->fd, rop->snapshot_id, rop->path, rop->flags, rop->size);
+
+	if (ret < 0) {
+		pr_perror("Error writing header for %s:%s",
+			rop->path, rop->snapshot_id);
+		return;
+	}
+
+	pr_info("[fd=%d] Forwarding %s request for %s:%s (%" PRIu64 " bytes\n",
+		rop->fd, strflags(rop->flags), rop->path, rop->snapshot_id,
+		rop->size);
+
+	// Go back to non-blocking
+	fd_set_nonblocking(rop->fd, true);
+
+	forwarding = true;
+	event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLOUT, rop);
+}
+
+static void handle_remote_accept(int fd)
+{
+	char path[PATH_MAX];
+	char snapshot_id[PATH_MAX];
+	int flags = 0;
+	uint64_t size = 0;
+	int64_t ret;
+	struct roperation* rop = NULL;
+
+	// Set blocking during the setup.
+	fd_set_nonblocking(fd, false);
+
+	ret = read_remote_header(fd, snapshot_id, path, &flags, &size);
+	if (ret < 0) {
+		pr_perror("Unable to receive remote header from image proxy");
+		goto err;
+	}
+	/* This means that the no more images are coming.
*/ + else if (!ret) { + finished_remote = true; + pr_info("Image Proxy connection closed.\n"); + return; + } + + // Go back to non-blocking + fd_set_nonblocking(fd, true); + + pr_info("[fd=%d] Received %s request for %s:%s with %" PRIu64 " bytes\n", + fd, strflags(flags), path, snapshot_id, size); + + + forwarding = true; + rop = handle_accept_write(fd, snapshot_id, path, flags, false, size); + + if (rop != NULL) { + list_add_tail(&(rop->l), &rop_inprogress); + event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLIN, rop); + } + return; +err: + close(fd); +} + +static void handle_local_accept(int fd) +{ + int cli_fd; + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + int flags = 0; + struct sockaddr_in cli_addr; + socklen_t clilen = sizeof(cli_addr); + struct roperation *rop = NULL; + + cli_fd = accept(fd, (struct sockaddr *) &cli_addr, &clilen); + if (cli_fd < 0) { + pr_perror("Unable to accept local image connection"); + return; + } + + if (read_header(cli_fd, snapshot_id, path, &flags) < 0) { + pr_err("Error reading local image header\n"); + goto err; + } + + if (snapshot_id[0] == NULL_SNAPSHOT_ID && path[0] == FINISH) { + close(cli_fd); + finish_local(); + return; + } + + pr_info("[fd=%d] Received %s request for %s:%s\n", + cli_fd, strflags(flags), path, snapshot_id); + + // Write/Append case (only possible in img-proxy). + if (flags != O_RDONLY) { + rop = handle_accept_proxy_write(cli_fd, snapshot_id, path, flags); + } else if (restoring) { + // Read case while restoring (img-cache). + rop = handle_accept_cache_read(cli_fd, snapshot_id, path, flags); + } else { + // Read case while dumping (img-proxy). + rop = handle_accept_proxy_read(cli_fd, snapshot_id, path, flags); + } + + // If we have an operation. Check if we are ready to start or not. + if (rop != NULL) { + if (rop->rimg != NULL) { + list_add_tail(&(rop->l), &rop_inprogress); + event_set( + epoll_fd, + EPOLL_CTL_ADD, + rop->fd, + rop->flags == O_RDONLY ? 
EPOLLOUT : EPOLLIN, + rop); + } else { + list_add_tail(&(rop->l), &rop_pending); + } + fd_set_nonblocking(rop->fd, false); + } + + return; +err: + close(cli_fd); +} + +static inline void finish_proxy_read(struct roperation *rop) +{ + // If finished forwarding image + if (rop->fd == remote_sk) { + // Update fwd buffer and byte count on rimg. + rop->rimg->curr_fwd_buf = rop->curr_sent_buf; + rop->rimg->curr_fwd_bytes = rop->curr_sent_bytes; + + forwarding = false; + + // If there are images waiting to be forwarded, forward the next. + if (!list_empty(&rop_forwarding)) { + forward_remote_image(list_entry(rop_forwarding.next, struct roperation, l)); + } + } +} + +static inline void finish_proxy_write(struct roperation *rop) +{ + // Normal image received, forward it. + struct roperation *rop_to_forward = new_remote_operation( + rop->path, rop->snapshot_id, remote_sk, rop->flags, false); + + // Add image to list of images. + list_add_tail(&(rop->rimg->l), &rimg_head); + + rop_set_rimg(rop_to_forward, rop->rimg); + if (list_empty(&rop_forwarding)) { + forward_remote_image(rop_to_forward); + } + list_add_tail(&(rop_to_forward->l), &rop_forwarding); +} + +static void finish_cache_write(struct roperation *rop) +{ + struct roperation *prop = get_rop_by_name( + &rop_pending, rop->snapshot_id, rop->path); + + forwarding = false; + event_set(epoll_fd, EPOLL_CTL_ADD, remote_sk, EPOLLIN, &remote_sk); + + // Add image to list of images. + list_add_tail(&(rop->rimg->l), &rimg_head); + + if (prop != NULL) { + pr_info("\t[fd=%d] Resuming pending %s for %s:%s\n", + prop->fd, strflags(prop->flags), + prop->snapshot_id, prop->path); + + // Write header for pending image. 
+		if (write_reply_header(prop->fd, 0) < 0) {
+			pr_perror("Error writing reply header for %s:%s",
+					prop->path, prop->snapshot_id);
+			close(prop->fd);
+			// prop was looked up on rop_pending and is still linked
+			// there; unlink it before freeing so the list never
+			// holds a dangling entry.
+			list_del(&(prop->l));
+			xfree(prop);
+			return;
+		}
+
+		rop_set_rimg(prop, rop->rimg);
+		list_del(&(prop->l));
+		list_add_tail(&(prop->l), &rop_inprogress);
+		event_set(epoll_fd, EPOLL_CTL_ADD, prop->fd, EPOLLOUT, prop);
+	}
+}
+
+/*
+ * Run one step of an in-flight operation: send when the event carries
+ * EPOLLOUT, receive otherwise. A positive result, or EAGAIN/EWOULDBLOCK,
+ * re-arms the fd in epoll and returns. Any other result unlinks the
+ * operation from its list, runs the matching finish_* hook on success,
+ * and frees rop.
+ */
+static void handle_roperation(struct epoll_event *event,
+	struct roperation *rop)
+{
+	int64_t ret = (EPOLLOUT & event->events) ?
+		send_image_async(rop) :
+		recv_image_async(rop);
+
+	if (ret > 0 || ret == EAGAIN || ret == EWOULDBLOCK) {
+		event_set(
+			epoll_fd,
+			EPOLL_CTL_ADD,
+			rop->fd,
+			event->events,
+			rop);
+		return;
+	}
+
+	// Remove rop from list (either in progress or forwarding).
+	list_del(&(rop->l));
+
+	// Operation is finished.
+	if (ret < 0) {
+		pr_perror("Unable to %s %s:%s (returned %" PRId64 ")",
+				event->events & EPOLLOUT ? "send" : "receive",
+				rop->rimg->path, rop->rimg->snapshot_id, ret);
+		goto err;
+	} else {
+		pr_info("[fd=%d] Finished %s %s:%s to CRIU (size %" PRIu64 ")\n",
+				rop->fd,
+				event->events & EPOLLOUT ? "sending" : "receiving",
+				rop->rimg->path, rop->rimg->snapshot_id, rop->rimg->size);
+	}
+
+	// If receive operation is finished
+	if (event->events & EPOLLIN) {
+		// Cached side (finished receiving forwarded image)
+		if (restoring) {
+			finish_cache_write(rop);
+		} else {
+			// Proxy side (finished receiving local image)
+			finish_proxy_write(rop);
+		}
+	} else {
+		// Proxy side (Finished forwarding image or reading it locally).
+		if (!restoring)
+			finish_proxy_read(rop);
+		// Nothing to be done when a read is finished on the cache side.
+ } +err: + xfree(rop); +} + +static void check_pending(void) +{ + struct roperation *rop = NULL; + struct rimage *rimg = NULL; + + list_for_each_entry(rop, &rop_pending, l) { + rimg = get_rimg_by_name(rop->snapshot_id, rop->path); + if (rimg != NULL) { + rop_set_rimg(rop, rimg); + if (restoring) { + event_set(epoll_fd, EPOLL_CTL_ADD, rop->fd, EPOLLOUT, rop); + } else { + forward_remote_image(rop); + return; + } + } + } +} + +void accept_image_connections(void) { + int ret; + + epoll_fd = epoll_create(EPOLL_MAX_EVENTS); + if (epoll_fd < 0) { + pr_perror("Unable to open epoll"); + return; + } + + events = calloc(EPOLL_MAX_EVENTS, sizeof(struct epoll_event)); + if (events == NULL) { + pr_perror("Failed to allocated epoll events"); + goto end; + } + + ret = event_set(epoll_fd, EPOLL_CTL_ADD, local_sk, EPOLLIN, &local_sk); + if (ret) { + pr_perror("Failed to add local fd to epoll"); + goto end; + } + + // Only if we are restoring (cache-side) we need to add the remote sock to + // the epoll. + if (restoring) { + ret = event_set(epoll_fd, EPOLL_CTL_ADD, remote_sk, + EPOLLIN, &remote_sk); + if (ret) { + pr_perror("Failed to add proxy to cache fd to epoll"); + goto end; + } + } + + while (1) { + int n_events, i; + + n_events = epoll_wait(epoll_fd, events, EPOLL_MAX_EVENTS, 250); + + /* epoll_wait isn't restarted after interrupted by a signal */ + if (n_events < 0 && errno != EINTR) { + pr_perror("Failed to epoll wait"); + goto end; + } + + for (i = 0; i < n_events; i++) { + // Accept from local dump/restore? 
+ if (events[i].data.ptr == &local_sk) { + if (events[i].events & EPOLLHUP || + events[i].events & EPOLLERR) { + if (!finished_local) + pr_perror("Unable to accept more local image connections"); + goto end; + } + handle_local_accept(local_sk); + } else if (restoring && !forwarding && events[i].data.ptr == &remote_sk) { + event_set(epoll_fd, EPOLL_CTL_DEL, remote_sk, 0, 0); + handle_remote_accept(remote_sk); + } else { + struct roperation *rop = + (struct roperation*)events[i].data.ptr; + event_set(epoll_fd, EPOLL_CTL_DEL, rop->fd, 0, 0); + handle_roperation(&events[i], rop); + } + } + + // Check if there are any pending operations + if (restoring || !forwarding) + check_pending(); + + // Check if we can close the tcp socket (this will unblock the cache + // to answer "no image" to restore). + if (!restoring && + finished_local && + !finished_remote && + list_empty(&rop_forwarding)) { + close(remote_sk); + finished_remote = true; + } + + // If both local and remote sockets are closed, leave. + if (finished_local && finished_remote) { + pr_info("Finished both local and remote, exiting\n"); + goto end; + } + } +end: + close(epoll_fd); + close(local_sk); + free(events); +} + + +/* Note: size is a limit on how much we want to read from the socket. Zero means + * read until the socket is closed. + */ +static int64_t recv_image_async(struct roperation *op) +{ + int fd = op->fd; + struct rimage *rimg = op->rimg; + uint64_t size = op->size; + bool close_fd = op->close_fd; + struct rbuf *curr_buf = op->curr_recv_buf; + int n; + + n = read(fd, + curr_buf->buffer + curr_buf->nbytes, + size ? 
+ min((int) (size - rimg->size), BUF_SIZE - curr_buf->nbytes) : + BUF_SIZE - curr_buf->nbytes); + if (n == 0) { + if (close_fd) + close(fd); + return n; + } else if (n > 0) { + curr_buf->nbytes += n; + rimg->size += n; + if (curr_buf->nbytes == BUF_SIZE) { + struct rbuf *buf = xmalloc(sizeof(struct rbuf)); + if (buf == NULL) { + if (close_fd) + close(fd); + return -1; + } + buf->nbytes = 0; + list_add_tail(&(buf->l), &(rimg->buf_head)); + op->curr_recv_buf = buf; + return n; + } + if (size && rimg->size == size) { + if (close_fd) + close(fd); + return 0; + } + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + return errno; + } else { + pr_perror("Read for %s:%s socket on fd=%d failed", + rimg->path, rimg->snapshot_id, fd); + if (close_fd) + close(fd); + return -1; + } + return n; +} + +static int64_t send_image_async(struct roperation *op) +{ + int fd = op->fd; + struct rimage *rimg = op->rimg; + bool close_fd = op->close_fd; + int n; + + n = write( + fd, + op->curr_sent_buf->buffer + op->curr_sent_bytes, + min(BUF_SIZE, op->curr_sent_buf->nbytes) - op->curr_sent_bytes); + + if (n > -1) { + op->curr_sent_bytes += n; + if (op->curr_sent_bytes == BUF_SIZE) { + op->curr_sent_buf = + list_entry(op->curr_sent_buf->l.next, struct rbuf, l); + op->curr_sent_bytes = 0; + return n; + } else if (op->curr_sent_bytes == op->curr_sent_buf->nbytes) { + if (close_fd) + close(fd); + return 0; + } + return n; + } else if (errno == EPIPE || errno == ECONNRESET) { + pr_warn("Connection for %s:%s was closed early than expected\n", + rimg->path, rimg->snapshot_id); + return 0; + } else if (errno == EAGAIN || errno == EWOULDBLOCK) { + return errno; + } else { + pr_perror("Write on %s:%s socket failed", + rimg->path, rimg->snapshot_id); + return -1; + } +} + +int read_remote_image_connection(char *snapshot_id, char *path) +{ + int error = 0; + int sockfd = setup_UNIX_client_socket(restoring ? 
DEFAULT_CACHE_SOCKET: DEFAULT_PROXY_SOCKET);
+
+	if (sockfd < 0) {
+		pr_err("Error opening local connection for %s:%s\n",
+				path, snapshot_id);
+		return -1;
+	}
+
+	// From here on the socket is ours: close it on every failure path.
+	if (write_header(sockfd, snapshot_id, path, O_RDONLY) < 0) {
+		pr_err("Error writing header for %s:%s\n", path, snapshot_id);
+		close(sockfd);
+		return -1;
+	}
+
+	if (read_reply_header(sockfd, &error) < 0) {
+		pr_err("Error reading reply header for %s:%s\n",
+				path, snapshot_id);
+		close(sockfd);
+		return -1;
+	}
+
+	if (!error || (snapshot_id[0] == NULL_SNAPSHOT_ID && path[0] != FINISH))
+		return sockfd;
+
+	if (error == ENOENT) {
+		pr_info("Image does not exist (%s:%s)\n", path, snapshot_id);
+		close(sockfd);
+		return -ENOENT;
+	}
+	pr_err("Unexpected error returned: %d (%s:%s)\n",
+			error, path, snapshot_id);
+	close(sockfd);
+	return -1;
+}
+
+/*
+ * Connect to the proxy socket and send a request header for
+ * snapshot_id:path with the given open flags.
+ *
+ * Returns the connected socket fd, or -1 on failure (the socket is closed
+ * before returning an error).
+ */
+int write_remote_image_connection(char *snapshot_id, char *path, int flags)
+{
+	int sockfd = setup_UNIX_client_socket(DEFAULT_PROXY_SOCKET);
+
+	if (sockfd < 0)
+		return -1;
+
+	if (write_header(sockfd, snapshot_id, path, flags) < 0) {
+		pr_err("Error writing header for %s:%s\n", path, snapshot_id);
+		close(sockfd);
+		return -1;
+	}
+	return sockfd;
+}
+
+int finish_remote_dump(void)
+{
+	int fd;
+	pr_info("Dump side is calling finish\n");
+
+	fd = write_remote_image_connection(NULL_SNAPSHOT_ID, FINISH, O_WRONLY);
+	if (fd == -1) {
+		pr_err("Unable to open finish dump connection\n");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int finish_remote_restore(void)
+{
+	int fd;
+	pr_info("Restore side is calling finish\n");
+
+	fd = read_remote_image_connection(NULL_SNAPSHOT_ID, FINISH);
+	if (fd == -1) {
+		pr_err("Unable to open finish restore connection\n");
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+int skip_remote_bytes(int fd, unsigned long len)
+{
+	static char buf[4096];
+	int n = 0;
+	unsigned long curr = 0;
+
+	for (; curr < len; ) {
+		n = read(fd, buf, min(len - curr, (unsigned long)4096));
+		if (n == 0) {
+			pr_perror("Unexpected end of stream (skipping %lx/%lx bytes)",
+				curr, len);
+			return
-1;
+		} else if (n > 0) {
+			curr += n;
+		} else {
+			pr_perror("Error while skipping bytes from stream (%lx/%lx)",
+				curr, len);
+			return -1;
+		}
+	}
+
+	if (curr != len) {
+		pr_err("Unable to skip the current number of bytes: %lx instead of %lx\n",
+			curr, len);
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Read the PARENT_IMG pseudo-image and append every snapshot id it holds
+ * to snapshot_head.
+ *
+ * Returns 0 when the stream was read to its end or when no such image
+ * exists, and a negative value on connection, read or allocation failure.
+ */
+static int pull_snapshot_ids(void)
+{
+	int n, sockfd;
+	SnapshotIdEntry *ls;
+	struct snapshot *s = NULL;
+
+	sockfd = read_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG);
+
+	/* The connection was successful but there is no file. */
+	if (sockfd < 0) {
+		// A missing image is reported through the return value
+		// (-ENOENT); errno is not set for that case.
+		if (sockfd != -ENOENT) {
+			pr_err("Unable to open snapshot id read connection\n");
+			return -1;
+		}
+		return 0;
+	}
+
+	while (1) {
+		n = pb_read_obj(sockfd, (void **)&ls, PB_SNAPSHOT_ID);
+		if (!n) {
+			close(sockfd);
+			return n;
+		} else if (n < 0) {
+			pr_err("Unable to read remote snapshot ids\n");
+			close(sockfd);
+			return n;
+		}
+
+		s = new_snapshot(ls->snapshot_id);
+		if (!s) {
+			free(ls);
+			close(sockfd);
+			return -1;
+		}
+		add_snapshot(s);
+		pr_info("[read_snapshot ids] parent = %s\n", ls->snapshot_id);
+		// new_snapshot() keeps its own copy of the id, so the decoded
+		// entry is released before reading the next one.
+		free(ls);
+	}
+}
+
+int push_snapshot_id(void)
+{
+	int n;
+	SnapshotIdEntry rn = SNAPSHOT_ID_ENTRY__INIT;
+	int sockfd;
+
+	restoring = false;
+
+	sockfd = write_remote_image_connection(NULL_SNAPSHOT_ID, PARENT_IMG, O_APPEND);
+	if (sockfd < 0) {
+		pr_err("Unable to open snapshot id push connection\n");
+		return -1;
+	}
+
+	rn.snapshot_id = xmalloc(sizeof(char) * PATH_MAX);
+	if (!rn.snapshot_id) {
+		close(sockfd);
+		return -1;
+	}
+	strlcpy(rn.snapshot_id, snapshot_id, PATH_MAX);
+
+	n = pb_write_obj(sockfd, &rn, PB_SNAPSHOT_ID);
+
+	xfree(rn.snapshot_id);
+	close(sockfd);
+	return n;
+}
+
+void init_snapshot_id(char *si)
+{
+	snapshot_id = si;
+}
+
+char *get_curr_snapshot_id(void)
+{
+	return snapshot_id;
+}
+
+int get_curr_snapshot_id_idx(void)
+{
+	struct snapshot *si;
+	int idx = 0;
+
+	if (list_empty(&snapshot_head))
+		pull_snapshot_ids();
+
+	list_for_each_entry(si, &snapshot_head,
l) { + if (!strncmp(si->snapshot_id, snapshot_id, PATH_MAX)) + return idx; + idx++; + } + + pr_err("Error, could not find current snapshot id (%s) fd\n", + snapshot_id); + return -1; +} + +char *get_snapshot_id_from_idx(int idx) +{ + struct snapshot *si; + + if (list_empty(&snapshot_head)) + pull_snapshot_ids(); + + /* Note: if idx is the service fd then we need the current + * snapshot_id idx. Else we need a parent snapshot_id idx. + */ + if (idx == get_service_fd(IMG_FD_OFF)) + idx = get_curr_snapshot_id_idx(); + + list_for_each_entry(si, &snapshot_head, l) { + if (!idx) + return si->snapshot_id; + idx--; + } + + pr_err("Error, could not find snapshot id for idx %d\n", idx); + return NULL; +} + +int get_curr_parent_snapshot_id_idx(void) +{ + return get_curr_snapshot_id_idx() - 1; +} diff --git a/criu/include/clone-noasan.h b/criu/include/clone-noasan.h index 8ef75fa736..0cfdaa1d9a 100644 --- a/criu/include/clone-noasan.h +++ b/criu/include/clone-noasan.h @@ -2,5 +2,7 @@ #define __CR_CLONE_NOASAN_H__ int clone_noasan(int (*fn)(void *), int flags, void *arg); +int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags, + int exit_signal, pid_t pid); #endif /* __CR_CLONE_NOASAN_H__ */ diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index 82f76ad948..e02848d2d0 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -38,6 +38,12 @@ struct cg_root_opt { char *newroot; }; +/* + * Pre-dump variants + */ +#define PRE_DUMP_SPLICE 1 /* Pre-dump using parasite */ +#define PRE_DUMP_READ 2 /* Pre-dump using process_vm_readv syscall */ + /* * Cgroup management options. 
*/ @@ -81,6 +87,7 @@ struct cr_options { int evasive_devices; int link_remap_ok; int log_file_per_pid; + int pre_dump_mode; bool swrk_restore; char *output; char *root; @@ -106,6 +113,7 @@ struct cr_options { char *cgroup_props; char *cgroup_props_file; struct list_head new_cgroup_roots; + char *cgroup_yard; bool autodetect_ext_mounts; int enable_external_sharing; int enable_external_masters; @@ -135,6 +143,7 @@ struct cr_options { int weak_sysctls; int status_fd; bool orphan_pts_master; + int remote; pid_t tree_id; int log_level; char *imgs_dir; @@ -147,10 +156,10 @@ struct cr_options { }; extern struct cr_options opts; -char *rpc_cfg_file; +extern char *rpc_cfg_file; extern int parse_options(int argc, char **argv, bool *usage_error, bool *has_exec_cmd, int state); -extern int check_options(); -extern void init_opts(); +extern int check_options(void); +extern void init_opts(void); #endif /* __CR_OPTIONS_H__ */ diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h index c2a635ba76..21ef543079 100644 --- a/criu/include/criu-log.h +++ b/criu/include/criu-log.h @@ -22,6 +22,8 @@ #include "log.h" +struct timeval; + extern int log_init(const char *output); extern void log_fini(void); extern int log_init_by_pid(pid_t pid); diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h index 852d271668..31fe161784 100644 --- a/criu/include/fault-injection.h +++ b/criu/include/fault-injection.h @@ -17,6 +17,7 @@ enum faults { FI_NO_BREAKPOINTS = 130, FI_PARTIAL_PAGES = 131, FI_HUGE_ANON_SHMEM_ID = 132, + FI_CANNOT_MAP_VDSO = 133, FI_MAX, }; diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h index d9c5c5e7b3..ea9d48c72f 100644 --- a/criu/include/fcntl.h +++ b/criu/include/fcntl.h @@ -34,6 +34,14 @@ struct f_owner_ex { # define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8) #endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#endif + +#ifndef F_GET_SEALS +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif + #ifndef 
O_PATH # define O_PATH 010000000 #endif diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h index 7a22d4d829..016d76a9fc 100644 --- a/criu/include/files-reg.h +++ b/criu/include/files-reg.h @@ -30,7 +30,6 @@ extern int open_reg_by_id(u32 id); extern int open_reg_fd(struct file_desc *); extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd, struct reg_file_info *, void *), void *arg); -extern void clear_ghost_files(void); extern const struct fdtype_ops regfile_dump_ops; extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg); diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h index 3135f56b4d..9ca9643a1c 100644 --- a/criu/include/image-desc.h +++ b/criu/include/image-desc.h @@ -42,6 +42,7 @@ enum { CR_FD_RULE, CR_FD_IPTABLES, CR_FD_IP6TABLES, + CR_FD_NFTABLES, CR_FD_NETNS, CR_FD_NETNF_CT, CR_FD_NETNF_EXP, @@ -79,7 +80,6 @@ enum { CR_FD_RLIMIT, CR_FD_ITIMERS, CR_FD_POSIX_TIMERS, - CR_FD_FILE_LOCKS_PID, CR_FD_IRMAP_CACHE, CR_FD_CPUINFO, @@ -106,6 +106,8 @@ enum { CR_FD_FIFO, CR_FD_PIPES, CR_FD_TTY_FILES, + CR_FD_MEMFD_FILE, + CR_FD_MEMFD_INODE, CR_FD_AUTOFS, diff --git a/criu/include/image.h b/criu/include/image.h index 2baa394960..1c7cc54718 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -83,6 +83,7 @@ #define VMA_AREA_SOCKET (1 << 11) #define VMA_AREA_VVAR (1 << 12) #define VMA_AREA_AIORING (1 << 13) +#define VMA_AREA_MEMFD (1 << 14) #define VMA_CLOSE (1 << 28) #define VMA_NO_PROT_WRITE (1 << 29) diff --git a/criu/include/img-remote.h b/criu/include/img-remote.h new file mode 100644 index 0000000000..1d2dd615c8 --- /dev/null +++ b/criu/include/img-remote.h @@ -0,0 +1,146 @@ +#include +#include + +#include +#include "common/list.h" +#include +#include + +#ifndef IMAGE_REMOTE_H +#define IMAGE_REMOTE_H + +#define FINISH 0 +#define PARENT_IMG "parent" +#define NULL_SNAPSHOT_ID 0 +#define DEFAULT_CACHE_SOCKET "img-cache.sock" +#define DEFAULT_PROXY_SOCKET "img-proxy.sock" + 
+#define DEFAULT_LISTEN 50 +#define BUF_SIZE 4096 + +struct rbuf { + char buffer[BUF_SIZE]; + int nbytes; /* How many bytes are in the buffer. */ + struct list_head l; +}; + +struct rimage { + /* Path and snapshot id identify the image. */ + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + /* List anchor. */ + struct list_head l; + /* List of buffers that compose the image. */ + struct list_head buf_head; + /* Number of bytes. */ + uint64_t size; + /* Note: forward (send) operation only. Buffer to start forwarding. */ + struct rbuf *curr_fwd_buf; + /* Note: forward (send) operation only. Number of fwd bytes in 'curr_fwd_buf'. */ + uint64_t curr_fwd_bytes; +}; + +/* Structure that describes the state of a remote operation on remote images. */ +struct roperation { + /* List anchor. */ + struct list_head l; + /* File descriptor being used. */ + int fd; + /* Path and snapshot id identify the required image. */ + char path[PATH_MAX]; + char snapshot_id[PATH_MAX]; + /* Remote image being used (may be null if the operation is pending). */ + struct rimage *rimg; + /* Flags for the operation. */ + int flags; + /* If fd should be closed when the operation is done. */ + bool close_fd; + /* Note: recv operation only. How many bytes should be received. */ + uint64_t size; + /* Note: recv operation only. Buffer being written. */ + struct rbuf *curr_recv_buf; // TODO - needed? Could be replaced by list.last! + /* Note: send operation only. Pointer to buffer being sent. */ + struct rbuf *curr_sent_buf; + /* Note: send operation only. Number of bytes sent in 'curr_sent_buf'. */ + uint64_t curr_sent_bytes; +}; + +/* This is the proxy to cache TCP socket FD. */ +extern int remote_sk; +/* This is the unix socket used to fulfill local requests. */ +extern int local_sk; +/* True if we are running the cache/restore, false if proxy/dump.
*/ +extern bool restoring; + +void accept_image_connections(void); +struct rimage *get_rimg_by_name(const char *snapshot_id, const char *path); + +int setup_UNIX_server_socket(char *path); + +/* Called by restore to get the fd correspondent to a particular path. This call + * will block until the connection is received. + */ +int read_remote_image_connection(char *snapshot_id, char *path); + +/* Called by dump to create a socket connection to the restore side. The socket + * fd is returned for further writing operations. + */ +int write_remote_image_connection(char *snapshot_id, char *path, int flags); + +/* Called by dump/restore when everything is dumped/restored. This function + * creates a new connection with a special control name. The receiver side uses + * it to ack that no more files are coming. + */ +int finish_remote_dump(void); +int finish_remote_restore(void); + +/* Starts an image proxy daemon (dump side). It receives image files through + * socket connections and forwards them to the image cache (restore side). + */ +int image_proxy(bool background, char *local_proxy_path); + +/* Starts an image cache daemon (restore side). It receives image files through + * socket connections and caches them until they are requested by the restore + * process. + */ +int image_cache(bool background, char *local_cache_path); + +/* Reads (discards) 'len' bytes from fd. This is used to emulate the function + * lseek, which is used to advance the file needle. + */ +int skip_remote_bytes(int fd, unsigned long len); + +/* To support iterative migration, the concept of snapshot_id is introduced + * (only when remote migration is enabled). Each image is tagged with one + * snapshot_id. The snapshot_id is the image directory used for the operation + * that creates the image (either predump or dump). Images stored in memory + * (both in Image Proxy and Image Cache) are identified by their name and + * snapshot_id. 
Snapshot_ids are ordered so that we can find parent pagemaps + * (that will be used when restoring the process). + */ + +/* Sets the current snapshot_id */ +void init_snapshot_id(char *ns); + +/* Returns the current snapshot_id. */ +char *get_curr_snapshot_id(void); + +/* Returns the snapshot_id index representing the current snapshot_id. This + * index represents the hierarchy position. For example: images tagged with + * the snapshot_id with index 1 are more recent than the images tagged with + * the snapshot_id with index 0. + */ +int get_curr_snapshot_id_idx(void); + +/* Returns the snapshot_id associated with the snapshot_id index. */ +char *get_snapshot_id_from_idx(int idx); + +/* Pushes the current snapshot_id into the snapshot_id hierarchy (into the Image + * Proxy and Image Cache). + */ +int push_snapshot_id(void); + +/* Returns the snapshot id index that precedes the current snapshot_id. */ +int get_curr_parent_snapshot_id_idx(void); + +#endif diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h index d93e07813f..27c870bb86 100644 --- a/criu/include/kerndat.h +++ b/criu/include/kerndat.h @@ -65,6 +65,8 @@ struct kerndat_s { bool x86_has_ptrace_fpu_xsave_bug; bool has_inotify_setnextwd; bool has_kcmp_epoll_tfd; + bool has_fsopen; + bool has_clone3_set_tid; }; extern struct kerndat_s kdat; diff --git a/criu/include/linux/mount.h b/criu/include/linux/mount.h new file mode 100644 index 0000000000..aa6be69ec6 --- /dev/null +++ b/criu/include/linux/mount.h @@ -0,0 +1,35 @@ +#ifndef _CRIU_LINUX_MOUNT_H +#define _CRIU_LINUX_MOUNT_H + +#include "common/config.h" +#include "compel/plugins/std/syscall-codes.h" + +#ifdef CONFIG_HAS_FSCONFIG +#include +#else +enum fsconfig_command { + FSCONFIG_SET_FLAG = 0, /* Set parameter, supplying no value */ + FSCONFIG_SET_STRING = 1, /* Set parameter, supplying a string value */ + FSCONFIG_SET_BINARY = 2, /* Set parameter, supplying a binary blob value */ + FSCONFIG_SET_PATH = 3, /* Set parameter, supplying an object by 
path */ + FSCONFIG_SET_PATH_EMPTY = 4, /* Set parameter, supplying an object by (empty) path */ + FSCONFIG_SET_FD = 5, /* Set parameter, supplying an object by fd */ + FSCONFIG_CMD_CREATE = 6, /* Invoke superblock creation */ + FSCONFIG_CMD_RECONFIGURE = 7, /* Invoke superblock reconfiguration */ +}; +#endif + +static inline int sys_fsopen(const char *fsname, unsigned int flags) +{ + return syscall(__NR_fsopen, fsname, flags); +} +static inline int sys_fsconfig(int fd, unsigned int cmd, const char *key, const char *value, int aux) +{ + return syscall(__NR_fsconfig, fd, cmd, key, value, aux); +} +static inline int sys_fsmount(int fd, unsigned int flags, unsigned int attr_flags) +{ + return syscall(__NR_fsmount, fd, flags, attr_flags); +} + +#endif diff --git a/criu/include/lsm.h b/criu/include/lsm.h index 3b82712829..a41915a4c5 100644 --- a/criu/include/lsm.h +++ b/criu/include/lsm.h @@ -39,7 +39,7 @@ extern int lsm_check_opts(void); #ifdef CONFIG_HAS_SELINUX int dump_xattr_security_selinux(int fd, FdinfoEntry *e); int run_setsockcreatecon(FdinfoEntry *e); -int reset_setsockcreatecon(); +int reset_setsockcreatecon(void); #else static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { return 0; @@ -47,7 +47,7 @@ static inline int dump_xattr_security_selinux(int fd, FdinfoEntry *e) { static inline int run_setsockcreatecon(FdinfoEntry *e) { return 0; } -static inline int reset_setsockcreatecon() { +static inline int reset_setsockcreatecon(void) { return 0; } #endif diff --git a/criu/include/magic.h b/criu/include/magic.h index 05101f436c..bdaca968d2 100644 --- a/criu/include/magic.h +++ b/criu/include/magic.h @@ -94,6 +94,7 @@ #define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */ #define AUTOFS_MAGIC 0x49353943 /* Sochi */ #define FILES_MAGIC 0x56303138 /* Toropets */ +#define MEMFD_INODE_MAGIC 0x48453499 /* Dnipro */ #define IFADDR_MAGIC RAW_IMAGE_MAGIC #define ROUTE_MAGIC RAW_IMAGE_MAGIC @@ -103,6 +104,7 @@ #define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC #define 
IPTABLES_MAGIC RAW_IMAGE_MAGIC #define IP6TABLES_MAGIC RAW_IMAGE_MAGIC +#define NFTABLES_MAGIC RAW_IMAGE_MAGIC #define NETNF_CT_MAGIC RAW_IMAGE_MAGIC #define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC diff --git a/criu/include/memfd.h b/criu/include/memfd.h new file mode 100644 index 0000000000..4189766fdc --- /dev/null +++ b/criu/include/memfd.h @@ -0,0 +1,31 @@ +#ifndef __CR_MEMFD_H__ +#define __CR_MEMFD_H__ + +#include +#include "int.h" +#include "common/config.h" + +struct fd_parms; +struct file_desc; + +extern int is_memfd(dev_t dev); +extern int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms); +extern const struct fdtype_ops memfd_dump_ops; + +extern int memfd_open(struct file_desc *d, u32 *fdflags); +extern struct collect_image_info memfd_cinfo; +extern struct file_desc *collect_memfd(u32 id); +extern int apply_memfd_seals(void); + +#ifdef CONFIG_HAS_MEMFD_CREATE +# include +#else +# include +# include +static inline int memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} +#endif /* CONFIG_HAS_MEMFD_CREATE */ + +#endif /* __CR_MEMFD_H__ */ diff --git a/criu/include/mount.h b/criu/include/mount.h index d9b375f5d8..8bf19b2666 100644 --- a/criu/include/mount.h +++ b/criu/include/mount.h @@ -96,7 +96,7 @@ extern int collect_binfmt_misc(void); static inline int collect_binfmt_misc(void) { return 0; } #endif -extern struct mount_info *mnt_entry_alloc(); +extern struct mount_info *mnt_entry_alloc(void); extern void mnt_entry_free(struct mount_info *mi); extern int __mntns_get_root_fd(pid_t pid); diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h index 287abb3c8a..a9a970a9be 100644 --- a/criu/include/namespaces.h +++ b/criu/include/namespaces.h @@ -166,7 +166,6 @@ extern int restore_ns(int rst, struct ns_desc *nd); extern int dump_task_ns_ids(struct pstree_item *); extern int predump_task_ns_ids(struct pstree_item *); -extern struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct 
ns_desc *nd, enum ns_type t); extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd); extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd); diff --git a/criu/include/net.h b/criu/include/net.h index 9976f6eb06..0a556f3da2 100644 --- a/criu/include/net.h +++ b/criu/include/net.h @@ -31,7 +31,7 @@ extern int collect_net_namespaces(bool for_dump); extern int network_lock(void); extern void network_unlock(void); -extern int network_lock_internal(); +extern int network_lock_internal(void); extern struct ns_desc net_ns_desc; @@ -47,11 +47,11 @@ extern int move_veth_to_bridge(void); extern int kerndat_link_nsid(void); extern int net_get_nsid(int rtsk, int fd, int *nsid); -extern struct ns_id *net_get_root_ns(); +extern struct ns_id *net_get_root_ns(void); extern int kerndat_nsid(void); extern void check_has_netns_ioc(int fd, bool *kdat_val, const char *name); extern int net_set_ext(struct ns_id *ns); -extern struct ns_id *get_root_netns(); -extern int read_net_ns_img(); +extern struct ns_id *get_root_netns(void); +extern int read_net_ns_img(void); #endif /* __CR_NET_H__ */ diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h index fa72273ea0..98061e2d3d 100644 --- a/criu/include/page-xfer.h +++ b/criu/include/page-xfer.h @@ -9,6 +9,9 @@ struct ps_info { extern int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd); +/* User buffer for read-mode pre-dump*/ +#define BUFFER_SIZE (PIPE_MAX_SIZE << PAGE_SHIFT) + /* * page_xfer -- transfer pages into image file. 
* Two images backends are implemented -- local image file @@ -48,6 +51,7 @@ struct page_xfer { extern int open_page_xfer(struct page_xfer *xfer, int fd_type, unsigned long id); struct page_pipe; extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *); +extern int page_xfer_predump_pages(int pid, struct page_xfer *, struct page_pipe *); extern int connect_to_page_server_to_send(void); extern int connect_to_page_server_to_recv(int epfd); extern int disconnect_from_page_server(void); diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h index 96a097b3d8..fd50ff47e1 100644 --- a/criu/include/proc_parse.h +++ b/criu/include/proc_parse.h @@ -3,7 +3,7 @@ #include -#include +#include "compel/infect.h" #define PROC_TASK_COMM_LEN 32 #define PROC_TASK_COMM_LEN_FMT "(%31s" diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index 31f5b9a796..d725d199ee 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -61,6 +61,12 @@ enum { PB_AUTOFS, PB_GHOST_CHUNK, PB_FILE, + PB_REMOTE_IMAGE, /* Header for images sent from proxy to cache.*/ + PB_LOCAL_IMAGE, /* Header for reading/writing images from/to proxy or cache. */ + PB_LOCAL_IMAGE_REPLY, /* Header for reading/writing images reply. */ + PB_SNAPSHOT_ID, /* Contains a single id. Used for reading/writing ids from proxy or cache. */ + PB_MEMFD_FILE, + PB_MEMFD_INODE, /* 60 */ /* PB_AUTOGEN_STOP */ diff --git a/criu/include/protobuf.h b/criu/include/protobuf.h index fb7489e9d4..0b6d8c1505 100644 --- a/criu/include/protobuf.h +++ b/criu/include/protobuf.h @@ -52,4 +52,11 @@ static inline int collect_images(struct collect_image_info **array, unsigned siz return 0; } +/* + * To speed up reading of packed objects + * by providing space on stack, this should + * be more than enough for most objects. 
+ */ +#define PB_PKOBJ_LOCAL_SIZE 1024 + #endif /* __CR_PROTOBUF_H__ */ diff --git a/criu/include/pstree.h b/criu/include/pstree.h index 7303c1fedc..61ab0ce0eb 100644 --- a/criu/include/pstree.h +++ b/criu/include/pstree.h @@ -42,7 +42,7 @@ enum { }; #define FDS_EVENT (1 << FDS_EVENT_BIT) -struct pstree_item *current; +extern struct pstree_item *current; struct rst_info; /* See alloc_pstree_item() for details */ diff --git a/criu/include/restorer.h b/criu/include/restorer.h index b93807f5fb..dfb4e6b712 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -221,6 +221,7 @@ struct task_restore_args { #endif int lsm_type; int child_subreaper; + bool has_clone3_set_tid; } __aligned(64); /* diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h index 07c634f4ad..3283849e44 100644 --- a/criu/include/rst_info.h +++ b/criu/include/rst_info.h @@ -4,6 +4,7 @@ #include "common/lock.h" #include "common/list.h" #include "vma.h" +#include "kerndat.h" struct task_entries { int nr_threads, nr_tasks, nr_helpers; diff --git a/criu/include/sched.h b/criu/include/sched.h new file mode 100644 index 0000000000..78f65e3b7e --- /dev/null +++ b/criu/include/sched.h @@ -0,0 +1,33 @@ +#ifndef __CR_SCHED_H__ +#define __CR_SCHED_H__ + +#include + +#ifndef ptr_to_u64 +#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr))) +#endif +#ifndef u64_to_ptr +#define u64_to_ptr(x) ((void *)(uintptr_t)x) +#endif + +/* + * This structure is needed by clone3(). The kernel + * calls it 'struct clone_args'. As CRIU will always + * need at least this part of the structure (VER1) + * to be able to test if clone3() with set_tid works, + * the structure is defined here as 'struct _clone_args'. 
+ */ + +struct _clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; +}; +#endif /* __CR_SCHED_H__ */ diff --git a/criu/include/shmem.h b/criu/include/shmem.h index 04ab8d0763..9afdb799af 100644 --- a/criu/include/shmem.h +++ b/criu/include/shmem.h @@ -13,8 +13,11 @@ extern int collect_sysv_shmem(unsigned long shmid, unsigned long size); extern int cr_dump_shmem(void); extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map); extern int fixup_sysv_shmems(void); +extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size); extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid); extern int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shmid); +extern int restore_memfd_shmem_content(int fd, unsigned long shmid, unsigned long size); + #define SYSV_SHMEM_SKIP_FD (0x7fffffff) diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h index 79966517bf..dec67ca6c0 100644 --- a/criu/include/sk-inet.h +++ b/criu/include/sk-inet.h @@ -83,7 +83,7 @@ extern void tcp_locked_conn_add(struct inet_sk_info *); extern void rst_unlock_tcp_connections(void); extern void cpt_unlock_tcp_connections(void); -extern int dump_one_tcp(int sk, struct inet_sk_desc *sd); +extern int dump_one_tcp(int sk, struct inet_sk_desc *sd, SkOptsEntry *soe); extern int restore_one_tcp(int sk, struct inet_sk_info *si); #define SK_EST_PARAM "tcp-established" diff --git a/criu/include/sockets.h b/criu/include/sockets.h index cd98d18e06..dcb1dd9e46 100644 --- a/criu/include/sockets.h +++ b/criu/include/sockets.h @@ -10,6 +10,7 @@ struct fdinfo_list_entry; struct sk_opts_entry; +struct mount_info; struct file_desc; struct fd_parms; struct cr_imgset; @@ -27,6 +28,7 @@ struct socket_desc { extern int dump_socket(struct 
fd_parms *p, int lfd, FdinfoEntry *); extern int dump_socket_opts(int sk, SkOptsEntry *soe); extern int restore_socket_opts(int sk, SkOptsEntry *soe); +extern int restore_socket_bufsz(int sk, SkOptsEntry *soe); extern void release_skopts(SkOptsEntry *); extern int restore_prepare_socket(int sk); extern void preload_socket_modules(void); @@ -42,6 +44,8 @@ extern int add_fake_unix_queuers(void); extern int fix_external_unix_sockets(void); extern int prepare_scms(void); extern int unix_note_scm_rights(int id_for, uint32_t *file_ids, int *fds, int n_ids); +extern int collect_unix_bindmounts(void); +extern int unix_prepare_bindmount(struct mount_info *mi); extern struct collect_image_info netlink_sk_cinfo; @@ -61,6 +65,7 @@ extern int netlink_receive_one(struct nlmsghdr *hdr, struct ns_id *ns, void *arg extern int unix_sk_id_add(unsigned int ino); extern int unix_sk_ids_parse(char *optarg); extern int unix_prepare_root_shared(void); +extern int unix_prepare_shared(void); extern int do_dump_opt(int sk, int level, int name, void *val, int len); #define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f)) diff --git a/criu/include/stats.h b/criu/include/stats.h index bab9a0507c..5d408b7b10 100644 --- a/criu/include/stats.h +++ b/criu/include/stats.h @@ -45,6 +45,7 @@ enum { }; extern void cnt_add(int c, unsigned long val); +extern void cnt_sub(int c, unsigned long val); #define DUMP_STATS 1 #define RESTORE_STATS 2 diff --git a/criu/include/tls.h b/criu/include/tls.h index aa25178876..b48e4b4808 100644 --- a/criu/include/tls.h +++ b/criu/include/tls.h @@ -4,7 +4,7 @@ # ifdef CONFIG_GNUTLS int tls_x509_init(int sockfd, bool is_server); -void tls_terminate_session(); +void tls_terminate_session(void); ssize_t tls_send(const void *buf, size_t len, int flags); ssize_t tls_recv(void *buf, size_t len, int flags); diff --git a/criu/include/tun.h b/criu/include/tun.h index ce0b266a64..b82c445a79 100644 --- a/criu/include/tun.h +++ b/criu/include/tun.h @@ -5,7 +5,7 @@ #define 
TUN_MINOR 200 #endif -struct ns_id *ns; +extern struct ns_id *ns; #include diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h index 33b7411dee..046cd96d7e 100644 --- a/criu/include/util-vdso.h +++ b/criu/include/util-vdso.h @@ -41,6 +41,11 @@ struct vdso_maps { bool compatible; }; +static inline bool vdso_is_present(struct vdso_maps *m) +{ + return m->vdso_start != VDSO_BAD_ADDR; +} + #define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, } #define VDSO_SYMTABLE_INIT \ diff --git a/criu/include/util.h b/criu/include/util.h index 313aacd8c2..778b1b1197 100644 --- a/criu/include/util.h +++ b/criu/include/util.h @@ -290,11 +290,9 @@ char *xstrcat(char *str, const char *fmt, ...) char *xsprintf(const char *fmt, ...) __attribute__ ((__format__ (__printf__, 1, 2))); -void print_data(unsigned long addr, unsigned char *data, size_t size); - -int setup_tcp_server(char *type, char *addr, unsigned short *port); +int setup_tcp_server(char *type); int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk); -int setup_tcp_client(char *hostname); +int setup_tcp_client(void); #define LAST_PID_PATH "sys/kernel/ns_last_pid" #define PID_MAX_PATH "sys/kernel/pid_max" @@ -380,4 +378,6 @@ static inline void print_stack_trace(pid_t pid) {} ___ret; \ }) +extern int mount_detached_fs(const char *fsname); + #endif /* __CR_UTIL_H__ */ diff --git a/criu/kerndat.c b/criu/kerndat.c index 39cacb8fef..2ad72c3505 100644 --- a/criu/kerndat.c +++ b/criu/kerndat.c @@ -33,7 +33,6 @@ #include "net.h" #include "tun.h" #include -#include #include "netfilter.h" #include "fsnotify.h" #include "linux/userfaultfd.h" @@ -41,6 +40,8 @@ #include "uffd.h" #include "vdso.h" #include "kcmp.h" +#include "sched.h" +#include "memfd.h" struct kerndat_s kdat = { }; @@ -364,7 +365,7 @@ static int kerndat_get_dirty_track(void) } /* The page frame number (PFN) is constant for the zero page */ -static int init_zero_page_pfn() +static int init_zero_page_pfn(void) { void *addr; int ret = 0; @@ -408,7 +409,7 
@@ static bool kerndat_has_memfd_create(void) { int ret; - ret = syscall(SYS_memfd_create, NULL, 0); + ret = memfd_create(NULL, 0); if (ret == -1 && errno == ENOSYS) kdat.has_memfd = false; @@ -429,7 +430,7 @@ static int get_task_size(void) return 0; } -static int kerndat_fdinfo_has_lock() +static int kerndat_fdinfo_has_lock(void) { int fd, pfd = -1, exit_code = -1, len; char buf[PAGE_SIZE]; @@ -464,7 +465,7 @@ static int kerndat_fdinfo_has_lock() return exit_code; } -static int get_ipv6() +static int get_ipv6(void) { if (access("/proc/sys/net/ipv6", F_OK) < 0) { if (errno == ENOENT) { @@ -723,6 +724,20 @@ static int kerndat_has_inotify_setnextwd(void) return ret; } +static int kerndat_has_fsopen(void) +{ + if (syscall(__NR_fsopen, NULL, -1) != -1) { + pr_err("fsopen should fail\n"); + return -1; + } + if (errno == ENOSYS) + pr_info("The new mount API (fsopen, fsmount) isn't supported\n"); + else + kdat.has_fsopen = true; + + return 0; +} + static int has_kcmp_epoll_tfd(void) { kcmp_epoll_slot_t slot = { }; @@ -972,6 +987,35 @@ static int kerndat_tun_netns(void) return check_tun_netns_cr(&kdat.tun_ns); } +static bool kerndat_has_clone3_set_tid(void) +{ + pid_t pid; + struct _clone_args args = {}; + + args.set_tid = -1; + /* + * On a system without clone3() this will return ENOSYS. + * On a system with clone3() but without set_tid this + * will return E2BIG. + * On a system with clone3() and set_tid it will return + * EINVAL. 
+ */ + pid = syscall(__NR_clone3, &args, sizeof(args)); + + if (pid == -1 && (errno == ENOSYS || errno == E2BIG)) { + kdat.has_clone3_set_tid = false; + return 0; + } + if (pid == -1 && errno == EINVAL) { + kdat.has_clone3_set_tid = true; + } else { + pr_perror("Unexpected error from clone3\n"); + return -1; + } + + return 0; +} + int kerndat_init(void) { int ret; @@ -1043,6 +1087,10 @@ int kerndat_init(void) ret = kerndat_has_inotify_setnextwd(); if (!ret) ret = has_kcmp_epoll_tfd(); + if (!ret) + ret = kerndat_has_fsopen(); + if (!ret) + ret = kerndat_has_clone3_set_tid(); kerndat_lsm(); kerndat_mmap_min_addr(); diff --git a/criu/log.c b/criu/log.c index 8bdf835341..0ee113b91a 100644 --- a/criu/log.c +++ b/criu/log.c @@ -199,8 +199,8 @@ void flush_early_log_buffer(int fd) } pos += hdr->len; } - if (early_log_buf_off) - pr_warn("The early log isn't empty\n"); + if (early_log_buf_off == EARLY_LOG_BUF_LEN) + pr_warn("The early log buffer is full, some messages may have been lost\n"); early_log_buf_off = 0; } diff --git a/criu/lsm.c b/criu/lsm.c index 9d7e55c11b..060f102592 100644 --- a/criu/lsm.c +++ b/criu/lsm.c @@ -133,7 +133,7 @@ static int selinux_get_sockcreate_label(pid_t pid, char **output) return 0; } -int reset_setsockcreatecon() +int reset_setsockcreatecon(void) { /* Currently this only works for SELinux. 
*/ if (kdat.lsm != LSMTYPE__SELINUX) diff --git a/criu/mem.c b/criu/mem.c index de66a62104..55022d94a2 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -29,7 +29,7 @@ #include "pagemap-cache.h" #include "fault-injection.h" #include "prctl.h" -#include +#include "compel/infect-util.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -351,7 +351,8 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl, pmc_t *pmc, - bool has_parent, bool pre_dump) + bool has_parent, bool pre_dump, + int parent_predump_mode) { u64 off = 0; u64 *map; @@ -361,6 +362,52 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, !vma_area_is(vma, VMA_ANON_SHARED)) return 0; + /* + * To facilitate any combination of pre-dump modes to run after + * one another, we need to take extra care as discussed below. + * + * The SPLICE mode pre-dump, processes all type of memory regions, + * whereas READ mode pre-dump skips processing those memory regions + * which lacks PROT_READ flag. + * + * Now on mixing pre-dump modes: + * If SPLICE mode follows SPLICE mode : no issue + * -> everything dumped both the times + * + * If READ mode follows READ mode : no issue + * -> non-PROT_READ skipped both the time + * + * If READ mode follows SPLICE mode : no issue + * -> everything dumped at first, + * the non-PROT_READ skipped later + * + * If SPLICE mode follows READ mode : Need special care + * + * If READ pre-dump happens first, then it has skipped processing + * non-PROT_READ regions. Following SPLICE pre-dump expects pagemap + * entries for all mappings in parent pagemap, but last READ mode + * pre-dump cycle has skipped processing & pagemap generation for + * non-PROT_READ regions. So SPLICE mode throws error of missing + * pagemap entry for encountered non-PROT_READ mapping. 
+ * + * To resolve this, the pre-dump-mode is stored in current pre-dump's + * inventory file. This pre-dump mode is read back from this file + * (present in parent pre-dump dir) as parent-pre-dump-mode during + * next pre-dump. + * + * If parent-pre-dump-mode and next-pre-dump-mode are in READ-mode -> + * SPLICE-mode order, then SPLICE mode doesn't expect mappings for + * non-PROT_READ regions in parent-image and marks "has_parent=false". + */ + + if (!(vma->e->prot & PROT_READ)) { + if (opts.pre_dump_mode == PRE_DUMP_READ && pre_dump) + return 0; + if ((parent_predump_mode == PRE_DUMP_READ && + opts.pre_dump_mode == PRE_DUMP_SPLICE) || !pre_dump) + has_parent = false; + } + if (vma_entry_is(vma->e, VMA_AREA_AIORING)) { if (pre_dump) return 0; @@ -406,6 +453,7 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, unsigned long pmc_size; int possible_pid_reuse = 0; bool has_parent; + int parent_predump_mode = -1; pr_info("\n"); pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, item->pid->real); @@ -472,9 +520,13 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, */ args->off = 0; has_parent = !!xfer.parent && !possible_pid_reuse; + if(mdc->parent_ie) + parent_predump_mode = mdc->parent_ie->pre_dump_mode; + list_for_each_entry(vma_area, &vma_area_list->h, list) { ret = generate_vma_iovs(item, vma_area, pp, &xfer, args, ctl, - &pmc, has_parent, mdc->pre_dump); + &pmc, has_parent, mdc->pre_dump, + parent_predump_mode); if (ret < 0) goto out_xfer; } @@ -482,7 +534,18 @@ static int __parasite_dump_pages_seized(struct pstree_item *item, if (mdc->lazy) memcpy(pargs_iovs(args), pp->iovs, sizeof(struct iovec) * pp->nr_iovs); - ret = drain_pages(pp, ctl, args); + + /* + * Faking drain_pages for pre-dump here. Actual drain_pages for pre-dump + * will happen after task unfreezing in cr_pre_dump_finish(). This is + * actual optimization which reduces time for which process was frozen + * during pre-dump.
+ */ + if (mdc->pre_dump && opts.pre_dump_mode == PRE_DUMP_READ) + ret = 0; + else + ret = drain_pages(pp, ctl, args); + if (!ret && !mdc->pre_dump) ret = xfer_pages(pp, &xfer); if (ret) @@ -528,13 +591,47 @@ int parasite_dump_pages_seized(struct pstree_item *item, * able to read the memory contents. * * Afterwards -- reprotect memory back. + * + * This step is required for "splice" mode pre-dump and dump. + * Skip this step for "read" mode pre-dump. + * "read" mode pre-dump delegates processing of non-PROT_READ + * regions to dump stage. Adding PROT_READ works fine for + * static processing (target process frozen during pre-dump) + * and fails for dynamic as explained below. + * + * Consider following sequence of instances to reason, why + * not to add PROT_READ in "read" mode pre-dump ? + * + * CRIU- "read" pre-dump Target Process + * + * 1. Creates mapping M + * without PROT_READ + * 2. CRIU freezes target + * process + * 3. Collect the mappings + * 4. Add PROT_READ to M + * (non-PROT_READ region) + * 5. CRIU unfreezes target + * process + * 6. Add flag PROT_READ + * to mapping M + * 7. Revoke flag PROT_READ + * from mapping M + * 8. process_vm_readv tries + * to copy mapping M + * (believing M have + * PROT_READ flag) + * 9. 
syscall fails to copy + * data from M */ - pargs->add_prot = PROT_READ; - ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); - if (ret) { - pr_err("Can't dump unprotect vmas with parasite\n"); - return ret; + if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + pargs->add_prot = PROT_READ; + ret = compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl); + if (ret) { + pr_err("Can't dump unprotect vmas with parasite\n"); + return ret; + } } if (fault_injected(FI_DUMP_PAGES)) { @@ -549,10 +646,12 @@ int parasite_dump_pages_seized(struct pstree_item *item, return ret; } - pargs->add_prot = 0; - if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { - pr_err("Can't rollback unprotected vmas with parasite\n"); - ret = -1; + if (!mdc->pre_dump || opts.pre_dump_mode == PRE_DUMP_SPLICE) { + pargs->add_prot = 0; + if (compel_rpc_call_sync(PARASITE_CMD_MPROTECT_VMAS, ctl)) { + pr_err("Can't rollback unprotected vmas with parasite\n"); + ret = -1; + } } return ret; diff --git a/criu/memfd.c b/criu/memfd.c new file mode 100644 index 0000000000..bca6900cb9 --- /dev/null +++ b/criu/memfd.c @@ -0,0 +1,470 @@ +#include +#include + +#include "common/compiler.h" +#include "common/lock.h" +#include "memfd.h" +#include "fdinfo.h" +#include "imgset.h" +#include "image.h" +#include "util.h" +#include "log.h" +#include "files.h" +#include "fs-magic.h" +#include "kerndat.h" +#include "files-reg.h" +#include "rst-malloc.h" +#include "fdstore.h" +#include "file-ids.h" +#include "namespaces.h" +#include "shmem.h" + +#include "protobuf.h" +#include "images/memfd.pb-c.h" + +#define MEMFD_PREFIX "/memfd:" +#define MEMFD_PREFIX_LEN (sizeof(MEMFD_PREFIX)-1) + +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +/* Linux 5.1+ */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future 
writes while mapped */ + +struct memfd_inode { + struct list_head list; + u32 id; + union { + /* Only for dump */ + struct { + u32 dev; + u32 ino; + }; + /* Only for restore */ + struct { + mutex_t lock; + int fdstore_id; + unsigned int pending_seals; + }; + }; +}; + +static LIST_HEAD(memfd_inodes); + +/* + * Dump only + */ + +static u32 memfd_inode_ids = 1; + +int is_memfd(dev_t dev) +{ + /* + * TODO When MAP_HUGETLB is used, the file device is not shmem_dev, + * Note that other parts of CRIU have similar issues, see + * is_anon_shmem_map(). + */ + return dev == kdat.shmem_dev; +} + +static int dump_memfd_inode(int fd, struct memfd_inode *inode, + const char *name, const struct stat *st) +{ + int ret = -1; + struct cr_img *img = NULL; + MemfdInodeEntry mie = MEMFD_INODE_ENTRY__INIT; + u32 shmid; + + /* + * shmids are chosen as the inode number of the corresponding mmaped + * file. See handle_vma() in proc_parse.c. + * It works for memfd too, because we share the same device as the + * shmem device. 
+ */ + shmid = inode->ino; + + pr_info("Dumping memfd:%s contents (id %#x, shmid: %#x, size: %"PRIu64")\n", + name, inode->id, shmid, st->st_size); + + if (dump_one_memfd_shmem(fd, shmid, st->st_size) < 0) + goto out; + + img = open_image(CR_FD_MEMFD_INODE, O_DUMP, inode->id); + if (!img) + goto out; + + mie.uid = userns_uid(st->st_uid); + mie.gid = userns_gid(st->st_gid); + mie.name = (char *)name; + mie.size = st->st_size; + mie.shmid = shmid; + + mie.seals = fcntl(fd, F_GET_SEALS); + if (mie.seals == -1) + goto out; + + if (pb_write_one(img, &mie, PB_MEMFD_INODE)) + goto out; + + ret = 0; + +out: + if (img) + close_image(img); + return ret; +} + +static struct memfd_inode *dump_unique_memfd_inode(int lfd, const char *name, const struct stat *st) +{ + struct memfd_inode *inode; + int fd; + + list_for_each_entry(inode, &memfd_inodes, list) + if ((inode->dev == st->st_dev) && (inode->ino == st->st_ino)) + return inode; + + inode = xmalloc(sizeof(*inode)); + if (inode == NULL) + return NULL; + + inode->dev = st->st_dev; + inode->ino = st->st_ino; + inode->id = memfd_inode_ids++; + + fd = open_proc(PROC_SELF, "fd/%d", lfd); + if (fd < 0) { + xfree(inode); + return NULL; + } + + if (dump_memfd_inode(fd, inode, name, st)) { + close(fd); + xfree(inode); + return NULL; + } + close(fd); + + list_add_tail(&inode->list, &memfd_inodes); + + return inode; +} + +static int dump_one_memfd(int lfd, u32 id, const struct fd_parms *p) +{ + MemfdFileEntry mfe = MEMFD_FILE_ENTRY__INIT; + FileEntry fe = FILE_ENTRY__INIT; + struct memfd_inode *inode; + struct fd_link _link, *link; + const char *name; + + if (!p->link) { + if (fill_fdlink(lfd, p, &_link)) + return -1; + link = &_link; + } else + link = p->link; + + strip_deleted(link); + /* link->name is always started with "." which has to be skipped. 
*/ + if (strncmp(link->name + 1, MEMFD_PREFIX, MEMFD_PREFIX_LEN) == 0) + name = &link->name[1 + MEMFD_PREFIX_LEN]; + else + name = link->name + 1; + + inode = dump_unique_memfd_inode(lfd, name, &p->stat); + if (!inode) + return -1; + + mfe.id = id; + mfe.flags = p->flags; + mfe.pos = p->pos; + mfe.fown = (FownEntry *)&p->fown; + mfe.inode_id = inode->id; + + fe.type = FD_TYPES__MEMFD; + fe.id = mfe.id; + fe.memfd = &mfe; + + return pb_write_one(img_from_set(glob_imgset, CR_FD_FILES), &fe, PB_FILE); +} + +int dump_one_memfd_cond(int lfd, u32 *id, struct fd_parms *parms) +{ + if (fd_id_generate_special(parms, id)) + return dump_one_memfd(lfd, *id, parms); + return 0; +} + +const struct fdtype_ops memfd_dump_ops = { + .type = FD_TYPES__MEMFD, + .dump = dump_one_memfd, +}; + + +/* + * Restore only + */ + +struct memfd_info { + MemfdFileEntry *mfe; + struct file_desc d; + struct memfd_inode *inode; +}; + +static int memfd_open_inode(struct memfd_inode *inode); + +static struct memfd_inode *memfd_alloc_inode(int id) +{ + struct memfd_inode *inode; + + list_for_each_entry(inode, &memfd_inodes, list) + if (inode->id == id) + return inode; + + inode = shmalloc(sizeof(*inode)); + if (!inode) + return NULL; + + inode->id = id; + mutex_init(&inode->lock); + inode->fdstore_id = -1; + inode->pending_seals = 0; + + list_add_tail(&inode->list, &memfd_inodes); + return inode; +} + +extern int restore_memfd_shm(int fd, u64 id, u64 size); +static int memfd_open_inode_nocache(struct memfd_inode *inode) +{ + MemfdInodeEntry *mie = NULL; + struct cr_img *img = NULL; + int fd = -1; + int ret = -1; + int flags; + + img = open_image(CR_FD_MEMFD_INODE, O_RSTR, inode->id); + if (!img) + goto out; + + if (pb_read_one(img, &mie, PB_MEMFD_INODE) < 0) + goto out; + + if (mie->seals == F_SEAL_SEAL) { + inode->pending_seals = 0; + flags = 0; + } else { + /* Seals are applied later due to F_SEAL_FUTURE_WRITE */ + inode->pending_seals = mie->seals; + flags = MFD_ALLOW_SEALING; + } + + fd = 
memfd_create(mie->name, flags); + if (fd < 0) { + pr_perror("Can't create memfd:%s", mie->name); + goto out; + } + + if (restore_memfd_shmem_content(fd, mie->shmid, mie->size)) + goto out; + + if (fchown(fd, mie->uid, mie->gid)) { + pr_perror("Can't change uid %d gid %d of memfd:%s", + (int)mie->uid, (int)mie->gid, mie->name); + goto out; + } + + inode->fdstore_id = fdstore_add(fd); + if (inode->fdstore_id < 0) + goto out; + + ret = fd; + fd = -1; + +out: + if (fd != -1) + close(fd); + if (img) + close_image(img); + if (mie) + memfd_inode_entry__free_unpacked(mie, NULL); + return ret; +} + +static int memfd_open_inode(struct memfd_inode *inode) +{ + int fd; + + if (inode->fdstore_id != -1) + return fdstore_get(inode->fdstore_id); + + mutex_lock(&inode->lock); + if (inode->fdstore_id != -1) + fd = fdstore_get(inode->fdstore_id); + else + fd = memfd_open_inode_nocache(inode); + mutex_unlock(&inode->lock); + + return fd; +} + +int memfd_open(struct file_desc *d, u32 *fdflags) +{ + struct memfd_info *mfi; + MemfdFileEntry *mfe; + int fd, _fd; + u32 flags; + + mfi = container_of(d, struct memfd_info, d); + mfe = mfi->mfe; + + if (inherited_fd(d, &fd)) + return fd; + + pr_info("Restoring memfd id=%d\n", mfe->id); + + fd = memfd_open_inode(mfi->inode); + if (fd < 0) + goto err; + + /* Reopen the fd with original permissions */ + flags = fdflags ? *fdflags : mfe->flags; + /* + * Ideally we should call compat version open() to not force the + * O_LARGEFILE file flag with regular open(). It doesn't seem that + * important though. 
+ */ + _fd = __open_proc(getpid(), 0, flags, "fd/%d", fd); + if (_fd < 0) { + pr_perror("Can't reopen memfd id=%d", mfe->id); + goto err; + } + close(fd); + fd = _fd; + + if (restore_fown(fd, mfe->fown) < 0) + goto err; + + if (lseek(fd, mfe->pos, SEEK_SET) < 0) { + pr_perror("Can't restore file position of memfd id=%d", mfe->id); + goto err; + } + + return fd; + +err: + if (fd >= 0) + close(fd); + return -1; +} + +static int memfd_open_fe_fd(struct file_desc *fd, int *new_fd) +{ + int tmp; + + tmp = memfd_open(fd, NULL); + if (tmp < 0) + return -1; + *new_fd = tmp; + return 0; +} + +static char *memfd_d_name(struct file_desc *d, char *buf, size_t s) +{ + MemfdInodeEntry *mie = NULL; + struct cr_img *img = NULL; + struct memfd_info *mfi; + char *ret = NULL; + + mfi = container_of(d, struct memfd_info, d); + + img = open_image(CR_FD_MEMFD_INODE, O_RSTR, mfi->inode->id); + if (!img) + goto out; + + if (pb_read_one(img, &mie, PB_MEMFD_INODE) < 0) + goto out; + + if (snprintf(buf, s, "%s%s", MEMFD_PREFIX, mie->name) >= s) { + pr_err("Buffer too small for memfd name %s\n", mie->name); + goto out; + } + + ret = buf; + +out: + if (img) + close_image(img); + if (mie) + memfd_inode_entry__free_unpacked(mie, NULL); + + return ret; +} + +static struct file_desc_ops memfd_desc_ops = { + .type = FD_TYPES__MEMFD, + .open = memfd_open_fe_fd, + .name = memfd_d_name, +}; + +static int collect_one_memfd(void *o, ProtobufCMessage *msg, struct cr_img *i) +{ + struct memfd_info *info = o; + + info->mfe = pb_msg(msg, MemfdFileEntry); + info->inode = memfd_alloc_inode(info->mfe->inode_id); + if (!info->inode) + return -1; + + return file_desc_add(&info->d, info->mfe->id, &memfd_desc_ops); +} + +struct collect_image_info memfd_cinfo = { + .fd_type = CR_FD_MEMFD_FILE, + .pb_type = PB_MEMFD_FILE, + .priv_size = sizeof(struct memfd_info), + .collect = collect_one_memfd, +}; + +struct file_desc *collect_memfd(u32 id) { + struct file_desc *fdesc; + + fdesc = find_file_desc_raw(FD_TYPES__MEMFD, 
id); + if (fdesc == NULL) + pr_err("No entry for memfd %#x\n", id); + + return fdesc; +} + +int apply_memfd_seals(void) +{ + /* + * We apply the seals after all the mappings are done because the seal + * F_SEAL_FUTURE_WRITE prevents future write access (added in + * Linux 5.1). Thus we must make sure all writable mappings are opened + * before applying this seal. + */ + + int ret, fd; + struct memfd_inode *inode; + + list_for_each_entry(inode, &memfd_inodes, list) { + if (!inode->pending_seals) + continue; + + fd = memfd_open_inode(inode); + if (fd < 0) + return -1; + + ret = fcntl(fd, F_ADD_SEALS, inode->pending_seals); + close(fd); + + if (ret < 0) { + pr_perror("Cannot apply seals on memfd"); + return -1; + } + } + + return 0; +} diff --git a/criu/mount.c b/criu/mount.c index 486d017197..208a492c10 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -28,6 +28,7 @@ #include "clone-noasan.h" #include "fdstore.h" +#include "sockets.h" #include "images/mnt.pb-c.h" /* @@ -330,7 +331,7 @@ static bool mounts_equal(struct mount_info *a, struct mount_info *b) */ static char *mnt_roots; -static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *yard_mount) +static struct mount_info *mnt_build_ids_tree(struct mount_info *list) { struct mount_info *m, *root = NULL; @@ -351,41 +352,14 @@ static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mou if (!parent) { /* Only a root mount can be without parent */ - if (root == NULL && m->is_ns_root) { + if (!root && m->is_ns_root) { root = m; - if (!yard_mount) - continue; - } - - if (!root) { - pr_err("No parent found for mountpoint %d (@%s)\n", - m->mnt_id, m->mountpoint); - return NULL; - } - - pr_debug("Mountpoint %d (@%s) w/o parent %d\n", - m->mnt_id, m->mountpoint, m->parent_mnt_id); - - if (!mounts_sb_equal(root, m) || - strcmp(root->root, m->root)) { - pr_err("Nested mount namespaces with different " - "roots %d (@%s %s) %d (@%s %s) are not supported yet\n", - root->mnt_id, 
root->mountpoint, root->root, - m->mnt_id, m->mountpoint, m->root); - return NULL; - } - - /* Mount all namespace roots into the roots yard. */ - parent = yard_mount; - if (unlikely(!yard_mount)) { - pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n", - m->mnt_id, m->mountpoint, m->root); - return NULL; + continue; } - pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n", - m->mnt_id, m->mountpoint, - parent->mnt_id, parent->mountpoint); + pr_err("No parent found for mountpoint %d (@%s)\n", + m->mnt_id, m->mountpoint); + return NULL; } m->parent = parent; @@ -397,9 +371,6 @@ static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mou return NULL; } - if (yard_mount) - return yard_mount; - return root; } @@ -415,13 +386,12 @@ static unsigned int mnt_depth(struct mount_info *m) return depth; } -static void mnt_resort_siblings(struct mount_info *tree) +static void __mnt_resort_children(struct mount_info *parent) { - struct mount_info *m, *p; LIST_HEAD(list); /* - * Put siblings of each node in an order they can be (u)mounted + * Put children mounts in an order they can be (u)mounted * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/ * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order. * Otherwise we will not be able to (u)mount them in a sequence. @@ -433,11 +403,12 @@ static void mnt_resort_siblings(struct mount_info *tree) * to contain hundreds (or more) elements. 
*/ - pr_info("\tResorting siblings on %d\n", tree->mnt_id); - while (!list_empty(&tree->children)) { + pr_info("\tResorting children of %d in mount order\n", parent->mnt_id); + while (!list_empty(&parent->children)) { + struct mount_info *m, *p; unsigned int depth; - m = list_first_entry(&tree->children, struct mount_info, siblings); + m = list_first_entry(&parent->children, struct mount_info, siblings); list_del(&m->siblings); depth = mnt_depth(m); @@ -446,10 +417,31 @@ static void mnt_resort_siblings(struct mount_info *tree) break; list_add_tail(&m->siblings, &p->siblings); - mnt_resort_siblings(m); } - list_splice(&list, &tree->children); + list_splice(&list, &parent->children); +} + +static struct mount_info *mnt_subtree_next(struct mount_info *mi, + struct mount_info *root); + +static void resort_siblings(struct mount_info *root, + void (*resort_children)(struct mount_info *)) { + struct mount_info *mi = root; + while (1) { + /* + * Explanation: sorting the children of the tree like these is + * safe and does not break the tree search in mnt_subtree_next + * (DFS-next search), as we sort children before calling next + * on parent and thus before DFS-next ever touches them, so + * from the perspective of DFS-next all children look like they + * are already sorted. 
+ */ + resort_children(mi); + mi = mnt_subtree_next(mi, root); + if (!mi) + break; + } } static void mnt_tree_show(struct mount_info *tree, int off) @@ -997,8 +989,7 @@ static int resolve_shared_mounts(struct mount_info *info, int root_master_id) return 0; } -static struct mount_info *mnt_build_tree(struct mount_info *list, - struct mount_info *root_mp) +static struct mount_info *mnt_build_tree(struct mount_info *list) { struct mount_info *tree; @@ -1007,11 +998,11 @@ static struct mount_info *mnt_build_tree(struct mount_info *list, */ pr_info("Building mountpoints tree\n"); - tree = mnt_build_ids_tree(list, root_mp); + tree = mnt_build_ids_tree(list); if (!tree) return NULL; - mnt_resort_siblings(tree); + resort_siblings(tree, __mnt_resort_children); pr_info("Done:\n"); mnt_tree_show(tree, 0); return tree; @@ -1335,8 +1326,10 @@ int ns_open_mountpoint(void *arg) } /* Remount all mounts as private to disable propagation */ - if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) + if (mount("none", "/", NULL, MS_REC|MS_PRIVATE, NULL)) { + pr_perror("Unable to remount"); goto err; + } if (umount_overmounts(mi)) goto err; @@ -1546,6 +1539,7 @@ static __maybe_unused int mount_cr_time_mount(struct ns_id *ns, unsigned int *s_ ret = mount(source, target, type, 0, NULL); if (ret < 0) { + pr_perror("Unable to mount %s %s", source, target); exit_code = -errno; goto restore_ns; } else { @@ -1690,7 +1684,7 @@ struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump) return NULL; } - ns->mnt.mntinfo_tree = mnt_build_tree(pm, NULL); + ns->mnt.mntinfo_tree = mnt_build_tree(pm); if (ns->mnt.mntinfo_tree == NULL) goto err; @@ -2014,7 +2008,10 @@ static int fetch_rt_stat(struct mount_info *m, const char *where) static int do_simple_mount(struct mount_info *mi, const char *src, const char *fstype, unsigned long mountflags) { - return mount(src, mi->mountpoint, fstype, mountflags, mi->options); + int ret = mount(src, mi->mountpoint, fstype, mountflags, mi->options); + if 
(ret) + pr_perror("Unable to mount %s %s (id=%d)", src, mi->mountpoint, mi->mnt_id); + return ret; } static char *mnt_fsname(struct mount_info *mi) @@ -2024,20 +2021,20 @@ static char *mnt_fsname(struct mount_info *mi) return mi->fstype->name; } -static int apply_sb_flags(void *args, int fd, pid_t pid) +static int userns_mount(char *src, void *args, int fd, pid_t pid) { unsigned long flags = *(unsigned long *) args; int rst = -1, err = -1; - char path[PSFDS]; + char target[PSFDS]; - snprintf(path, sizeof(path), "/proc/self/fd/%d", fd); + snprintf(target, sizeof(target), "/proc/self/fd/%d", fd); if (pid != getpid() && switch_ns(pid, &mnt_ns_desc, &rst)) return -1; - err = mount(NULL, path, NULL, MS_REMOUNT | flags, NULL); + err = mount(src, target, NULL, flags, NULL); if (err) - pr_perror("Unable to remount %s", path); + pr_perror("Unable to mount %s", target); if (rst >= 0 && restore_ns(rst, &mnt_ns_desc)) return -1; @@ -2045,6 +2042,16 @@ static int apply_sb_flags(void *args, int fd, pid_t pid) return err; } +static int apply_sb_flags(void *args, int fd, pid_t pid) +{ + return userns_mount(NULL, args, fd, pid); +} + +static int mount_root(void *args, int fd, pid_t pid) +{ + return userns_mount(opts.root, args, fd, pid); +} + static int do_new_mount(struct mount_info *mi) { unsigned long sflags = mi->sb_flags; @@ -2092,10 +2099,9 @@ static int do_new_mount(struct mount_info *mi) pr_perror("Unable to open %s", mi->mountpoint); return -1; } - sflags |= MS_RDONLY; - if (userns_call(apply_sb_flags, 0, - &sflags, sizeof(sflags), fd)) { - pr_perror("Unable to apply mount flags %d for %s", + sflags |= MS_RDONLY | MS_REMOUNT; + if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { + pr_err("Unable to apply mount flags %d for %s", mi->sb_flags, mi->mountpoint); close(fd); return -1; @@ -2135,7 +2141,7 @@ static int restore_ext_mount(struct mount_info *mi) static char mnt_clean_path[] = "/tmp/cr-tmpfs.XXXXXX"; -static int mount_clean_path() +static int 
mount_clean_path(void) { /* * To make a bind mount, we need to have access to a source directory, @@ -2162,7 +2168,7 @@ static int mount_clean_path() return 0; } -static int umount_clean_path() +static int umount_clean_path(void) { if (umount2(mnt_clean_path, MNT_DETACH)) { pr_perror("Unable to umount %s", mnt_clean_path); @@ -2229,6 +2235,12 @@ static int do_bind_mount(struct mount_info *mi) mnt_path = mnt_fd_path; } + if (unix_prepare_bindmount(mi)) { + pr_err("Failed to prepare bindmount on unix at %s\n", + mi->mountpoint); + goto err; + } + if (cut_root[0] == 0) /* This case is handled by mi->bind->fd */ goto skip_overmount_check; @@ -2495,14 +2507,35 @@ static int do_mount_one(struct mount_info *mi) pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin); if (rst_mnt_is_root(mi)) { + int fd; + unsigned long flags = MS_BIND | MS_REC; + if (opts.root == NULL) { pr_err("The --root option is required to restore a mount namespace\n"); return -1; } /* do_mount_root() is called from populate_mnt_ns() */ - if (mount(opts.root, mi->mountpoint, NULL, MS_BIND | MS_REC, NULL)) - return -1; + if (root_ns_mask & CLONE_NEWUSER) { + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->mountpoint); + return -1; + } + + if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { + pr_err("Unable to mount %s\n", mi->mountpoint); + close(fd); + return -1; + } + close(fd); + } else { + if (mount(opts.root, mi->mountpoint, NULL, flags, NULL)) { + pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->mountpoint, mi->mnt_id); + return -1; + } + } + if (do_mount_root(mi)) return -1; mi->mounted = true; @@ -2589,6 +2622,7 @@ static int try_remap_mount(struct mount_info *m) struct mnt_remap_entry *r; if (!mnt_needs_remap(m)) + return 0; BUG_ON(!m->parent); @@ -2633,7 +2667,7 @@ static int find_remap_mounts(struct mount_info *root) } /* Move remapped mounts to places where they have to be */ -static int 
fixup_remap_mounts() +static int fixup_remap_mounts(void) { struct mnt_remap_entry *r; @@ -2881,7 +2915,7 @@ static int get_mp_mountpoint(char *mountpoint, struct mount_info *mi, char *root return 0; } -static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) +static int collect_mnt_from_image(struct mount_info **head, struct mount_info **tail, struct ns_id *nsid) { MntEntry *me = NULL; int ret, root_len = 1; @@ -2909,8 +2943,10 @@ static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid) goto err; pm->nsid = nsid; - pm->next = *pms; - *pms = pm; + pm->next = *head; + *head = pm; + if (!*tail) + *tail = pm; pm->mnt_id = me->mnt_id; pm->parent_mnt_id = me->parent_mnt_id; @@ -2989,12 +3025,26 @@ int read_mnt_ns_img(void) struct mount_info *pms = NULL; struct ns_id *nsid; + if (!(root_ns_mask & CLONE_NEWNS)) { + mntinfo = NULL; + return 0; + } + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + struct mount_info *head = NULL, *tail = NULL; + if (nsid->nd != &mnt_ns_desc) continue; - if (collect_mnt_from_image(&pms, nsid)) + if (collect_mnt_from_image(&head, &tail, nsid)) return -1; + + nsid->mnt.mntinfo_tree = mnt_build_tree(head); + if (!nsid->mnt.mntinfo_tree) + return -1; + + tail->next = pms; + pms = head; } mntinfo = pms; @@ -3096,6 +3146,40 @@ void fini_restore_mntns(void) } } +static int merge_mount_trees(struct mount_info *root_yard) +{ + struct mount_info *first = NULL; + struct ns_id *nsid; + + /* Merge mount trees together under root_yard */ + for (nsid = ns_ids; nsid; nsid = nsid->next) { + struct mount_info *root; + + if (nsid->nd != &mnt_ns_desc) + continue; + + root = nsid->mnt.mntinfo_tree; + + if (!first) + first = root; + else if (!mounts_sb_equal(root, first) || + strcmp(root->root, first->root)) { + pr_err("Nested mount namespaces with different " + "roots %d (@%s %s) %d (@%s %s) are not supported yet\n", + root->mnt_id, root->mountpoint, root->root, + first->mnt_id, first->mountpoint, first->root); 
+ return -1; + } + + pr_debug("Mountpoint %d (@%s) moved to the root yard\n", + root->mnt_id, root->mountpoint); + root->parent = root_yard; + list_add(&root->siblings, &root_yard->children); + } + + return 0; +} + /* * All nested mount namespaces are restore as sub-trees of the root namespace. */ @@ -3135,54 +3219,36 @@ static int populate_roots_yard(void) static int populate_mnt_ns(void) { - struct mount_info *pms; - struct ns_id *nsid; int ret; - if (mnt_roots) { - /* mnt_roots is a tmpfs mount and it's private */ - root_yard_mp = mnt_entry_alloc(); - if (!root_yard_mp) - return -1; + root_yard_mp = mnt_entry_alloc(); + if (!root_yard_mp) + return -1; - root_yard_mp->mountpoint = mnt_roots; - root_yard_mp->mounted = true; - } + root_yard_mp->mountpoint = mnt_roots; + root_yard_mp->mounted = true; - pms = mnt_build_tree(mntinfo, root_yard_mp); - if (!pms) + if (merge_mount_trees(root_yard_mp)) return -1; #ifdef CONFIG_BINFMT_MISC_VIRTUALIZED if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { /* Add to mount tree. Generic code will mount it later */ - ret = add_cr_time_mount(pms, "binfmt_misc", BINFMT_MISC_HOME, 0); + ret = add_cr_time_mount(root_yard_mp, "binfmt_misc", BINFMT_MISC_HOME, 0); if (ret) return -1; } #endif - if (resolve_shared_mounts(mntinfo, pms->master_id)) + if (resolve_shared_mounts(mntinfo, 0)) return -1; - for (nsid = ns_ids; nsid; nsid = nsid->next) { - if (nsid->nd != &mnt_ns_desc) - continue; - - /* - * Make trees of all namespaces look the - * same, so that manual paths resolution - * works on them. 
- */ - nsid->mnt.mntinfo_tree = pms; - } - if (validate_mounts(mntinfo, false)) return -1; - mnt_tree_for_each(pms, set_is_overmounted); + mnt_tree_for_each(root_yard_mp, set_is_overmounted); - if (find_remap_mounts(pms)) + if (find_remap_mounts(root_yard_mp)) return -1; if (populate_roots_yard()) @@ -3191,8 +3257,8 @@ static int populate_mnt_ns(void) if (mount_clean_path()) return -1; - ret = mnt_tree_for_each(pms, do_mount_one); - mnt_tree_for_each(pms, do_close_one); + ret = mnt_tree_for_each(root_yard_mp, do_mount_one); + mnt_tree_for_each(root_yard_mp, do_close_one); if (ret == 0 && fixup_remap_mounts()) return -1; @@ -3680,27 +3746,38 @@ struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt"); static int call_helper_process(int (*call)(void *), void *arg) { - int pid, status; + int pid, status, exit_code = -1; + + /* + * Running new helper process on the restore must be + * done under last_pid mutex: other tasks may be restoring + * threads and the PID we need there might be occupied by + * this clone() call. 
+ */ + lock_last_pid(); pid = clone_noasan(call, CLONE_VFORK | CLONE_VM | CLONE_FILES | CLONE_IO | CLONE_SIGHAND | CLONE_SYSVSEM, arg); if (pid == -1) { pr_perror("Can't clone helper process"); - return -1; + goto out; } errno = 0; if (waitpid(pid, &status, __WALL) != pid) { pr_perror("Unable to wait %d", pid); - return -1; + goto out; } if (status) { pr_err("Bad child exit status: %d\n", status); - return -1; + goto out; } - return 0; + exit_code = 0; +out: + unlock_last_pid(); + return exit_code; } static int ns_remount_writable(void *arg) @@ -3820,3 +3897,21 @@ int remount_readonly_mounts(void) */ return call_helper_process(ns_remount_readonly_mounts, NULL); } + +static struct mount_info *mnt_subtree_next(struct mount_info *mi, + struct mount_info *root) +{ + if (!list_empty(&mi->children)) + return list_entry(mi->children.next, + struct mount_info, siblings); + + while (mi->parent && mi != root) { + if (mi->siblings.next == &mi->parent->children) + mi = mi->parent; + else + return list_entry(mi->siblings.next, + struct mount_info, siblings); + } + + return NULL; +} diff --git a/criu/namespaces.c b/criu/namespaces.c index a228737ee8..21266df7c8 100644 --- a/criu/namespaces.c +++ b/criu/namespaces.c @@ -290,7 +290,7 @@ static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_ pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid); } -struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, +static struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type type) { struct ns_id *nsid; @@ -976,7 +976,7 @@ int dump_user_ns(pid_t pid, int ns_id) return exit_code; } -void free_userns_maps() +void free_userns_maps(void) { if (userns_entry.n_uid_map > 0) { xfree(userns_entry.uid_map[0]); diff --git a/criu/net.c b/criu/net.c index 44b0ce2242..86fba2ddc5 100644 --- a/criu/net.c +++ b/criu/net.c @@ -17,6 +17,10 @@ #include #include +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) 
+#include +#endif + #ifdef CONFIG_HAS_SELINUX #include #endif @@ -210,6 +214,19 @@ char *devconfs6[] = { #define MAX_CONF_OPT_PATH IFNAMSIZ+60 #define MAX_STR_CONF_LEN 200 +static const char *unix_conf_entries[] = { + "max_dgram_qlen", +}; + +/* + * MAX_CONF_UNIX_PATH = (sizeof(CONF_UNIX_FMT) - strlen("%s")) + * + MAX_CONF_UNIX_OPT_PATH + */ +#define CONF_UNIX_BASE "net/unix" +#define CONF_UNIX_FMT CONF_UNIX_BASE"/%s" +#define MAX_CONF_UNIX_OPT_PATH 32 +#define MAX_CONF_UNIX_PATH (sizeof(CONF_UNIX_FMT) + MAX_CONF_UNIX_OPT_PATH - 2) + static int net_conf_op(char *tgt, SysctlEntry **conf, int n, int op, char *proto, struct sysctl_req *req, char (*path)[MAX_CONF_OPT_PATH], int size, char **devconfs, SysctlEntry **def_conf) @@ -339,6 +356,72 @@ static int ipv6_conf_op(char *tgt, SysctlEntry **conf, int n, int op, SysctlEntr devconfs6, def_conf); } +static int unix_conf_op(SysctlEntry ***rconf, size_t *n, int op) +{ + int i, ret = -1, flags = 0; + char path[ARRAY_SIZE(unix_conf_entries)][MAX_CONF_UNIX_PATH] = { }; + struct sysctl_req req[ARRAY_SIZE(unix_conf_entries)] = { }; + SysctlEntry **conf = *rconf; + + if (*n != ARRAY_SIZE(unix_conf_entries)) { + pr_err("unix: Unexpected entries in config (%zu %zu)\n", + *n, ARRAY_SIZE(unix_conf_entries)); + return -EINVAL; + } + + if (opts.weak_sysctls || op == CTL_READ) + flags = CTL_FLAGS_OPTIONAL; + + for (i = 0; i < *n; i++) { + snprintf(path[i], MAX_CONF_UNIX_PATH, CONF_UNIX_FMT, + unix_conf_entries[i]); + req[i].name = path[i]; + req[i].flags = flags; + + switch (conf[i]->type) { + case SYSCTL_TYPE__CTL_32: + req[i].type = CTL_32; + req[i].arg = &conf[i]->iarg; + break; + default: + pr_err("unix: Unknown config type %d\n", + conf[i]->type); + return -1; + } + } + + ret = sysctl_op(req, *n, op, CLONE_NEWNET); + if (ret < 0) { + pr_err("unix: Failed to %s %s/\n", + (op == CTL_READ) ? 
"read" : "write", + CONF_UNIX_BASE); + return -1; + } + + if (op == CTL_READ) { + bool has_entries = false; + + for (i = 0; i < *n; i++) { + if (req[i].flags & CTL_FLAGS_HAS) { + conf[i]->has_iarg = true; + if (!has_entries) + has_entries = true; + } + } + + /* + * Zap the whole section of data. + * Unix conf is optional. + */ + if (!has_entries) { + *n = 0; + *rconf = NULL; + } + } + + return 0; +} + /* * I case if some entry is missing in * the kernel, simply write DEVCONFS_UNUSED @@ -1686,7 +1769,7 @@ static int __restore_links(struct ns_id *nsid, int *nrlinks, int *nrcreated) return 0; } -static int restore_links() +static int restore_links(void) { int nrcreated, nrlinks; struct ns_id *nsid; @@ -1818,12 +1901,63 @@ static inline int dump_iptables(struct cr_imgset *fds) return 0; } +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline int dump_nftables(struct cr_imgset *fds) +{ + int ret = -1; + struct cr_img *img; + int img_fd; + FILE *fp; + struct nft_ctx *nft; + + nft = nft_ctx_new(NFT_CTX_DEFAULT); + if (!nft) + return -1; + + img = img_from_set(fds, CR_FD_NFTABLES); + img_fd = dup(img_raw_fd(img)); + if (img_fd < 0) { + pr_perror("dup() failed"); + goto nft_ctx_free_out; + } + + fp = fdopen(img_fd, "w"); + if (!fp) { + pr_perror("fdopen() failed"); + close(img_fd); + goto nft_ctx_free_out; + } + + nft_ctx_set_output(nft, fp); +#define DUMP_NFTABLES_CMD "list ruleset" +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) + if (nft_run_cmd_from_buffer(nft, DUMP_NFTABLES_CMD, strlen(DUMP_NFTABLES_CMD))) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (nft_run_cmd_from_buffer(nft, DUMP_NFTABLES_CMD)) +#else + BUILD_BUG_ON(1); +#endif + goto fp_close_out; + + ret = 0; + +fp_close_out: + fclose(fp); +nft_ctx_free_out: + nft_ctx_free(nft); + + return ret; +} +#endif + static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) { void *buf, *o_buf; int ret = -1; int i; NetnsEntry netns = NETNS_ENTRY__INIT; + 
+ SysctlEntry *unix_confs = NULL; + size_t sizex = ARRAY_SIZE(unix_conf_entries); SysctlEntry *def_confs4 = NULL, *all_confs4 = NULL; int size4 = ARRAY_SIZE(devconfs4); SysctlEntry *def_confs6 = NULL, *all_confs6 = NULL; @@ -1840,7 +1974,8 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) o_buf = buf = xmalloc( i * (sizeof(NetnsId*) + sizeof(NetnsId)) + size4 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 + - size6 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 + size6 * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) * 2 + + sizex * (sizeof(SysctlEntry*) + sizeof(SysctlEntry)) ); if (!buf) goto out; @@ -1896,6 +2031,16 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) } } + netns.n_unix_conf = sizex; + netns.unix_conf = xptr_pull_s(&buf, sizex * sizeof(SysctlEntry*)); + unix_confs = xptr_pull_s(&buf, sizex * sizeof(SysctlEntry)); + + for (i = 0; i < sizex; i++) { + sysctl_entry__init(&unix_confs[i]); + netns.unix_conf[i] = &unix_confs[i]; + netns.unix_conf[i]->type = SYSCTL_TYPE__CTL_32; + } + ret = ipv4_conf_op("default", netns.def_conf4, size4, CTL_READ, NULL); if (ret < 0) goto err_free; @@ -1910,6 +2055,10 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) if (ret < 0) goto err_free; + ret = unix_conf_op(&netns.unix_conf, &netns.n_unix_conf, CTL_READ); + if (ret < 0) + goto err_free; + ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS); err_free: xfree(o_buf); @@ -1919,19 +2068,59 @@ static int dump_netns_conf(struct ns_id *ns, struct cr_imgset *fds) +/* + * Feed the image (type, pid) to run_ip_tool(cmd, "restore", ...). + * + * The image contents are first copied into a tmpfile() and the tool is + * handed that copy's descriptor. Returns 0 for an empty image, -1 on any + * local failure, otherwise the result of run_ip_tool(). + */ static int restore_ip_dump(int type, int pid, char *cmd) { - int ret = -1; + int ret = -1, sockfd, n, written; + FILE *tmp_file; struct cr_img *img; + char buf[1024]; img = open_image(type, O_RSTR, pid); + if (!img) + return -1; if (empty_image(img)) { close_image(img); return 0; } + sockfd = img_raw_fd(img); + tmp_file = tmpfile(); + if (!tmp_file) { + pr_perror("Failed to open tmpfile"); + close_image(img); + return -1; + } + + while ((n = read(sockfd, buf, sizeof(buf))) > 0) { +
written = fwrite(buf, sizeof(char), n, tmp_file); + if (written < n) { + pr_perror("Failed to write to tmpfile " + "[written: %d; total: %d]", written, n); + goto close; + } + } + if (n < 0) { + pr_perror("Failed to read image"); + goto close; + } + + if (fseek(tmp_file, 0, SEEK_SET)) { + pr_perror("Failed to set file position to beginning of tmpfile"); + goto close; + } + - if (img) { - ret = run_ip_tool(cmd, "restore", NULL, NULL, img_raw_fd(img), -1, 0); - close_image(img); - } + ret = run_ip_tool(cmd, "restore", NULL, NULL, fileno(tmp_file), -1, 0); +close: + close_image(img); + if (fclose(tmp_file)) { + pr_perror("Failed to close tmpfile"); + } + return ret; } @@ -1984,7 +2161,7 @@ static inline int restore_rule(int pid) * iptables-restore is executed from a target userns and it may have not enough * rights to open /run/xtables.lock. Here we try to workaround this problem. */ -static int prepare_xtable_lock() +static int prepare_xtable_lock(void) { int fd; @@ -2032,6 +2209,7 @@ static inline int restore_iptables(int pid) return -1; if (empty_image(img)) { ret = 0; + close_image(img); goto ipt6; } @@ -2053,10 +2231,67 @@ static inline int restore_iptables(int pid) return ret; } +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) +static inline int restore_nftables(int pid) +{ + int ret = -1; + struct cr_img *img; + struct nft_ctx *nft; + off_t img_data_size; + char *buf; + + img = open_image(CR_FD_NFTABLES, O_RSTR, pid); + if (img == NULL) + return -1; + if (empty_image(img)) { + /* Backward compatibility */ + pr_info("Skipping nft restore, no image\n"); + ret = 0; + goto image_close_out; + } + + if ((img_data_size = img_raw_size(img)) < 0) + goto image_close_out; + + if (read_img_str(img, &buf, img_data_size) < 0) + goto image_close_out; + + nft = nft_ctx_new(NFT_CTX_DEFAULT); + if (!nft) + goto buf_free_out; + + if (nft_ctx_buffer_output(nft) || nft_ctx_buffer_error(nft) || +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) + nft_run_cmd_from_buffer(nft, buf, strlen(buf))) +#elif defined(CONFIG_HAS_NFTABLES_LIB_API_1) +
nft_run_cmd_from_buffer(nft, buf)) +#else + { + BUILD_BUG_ON(1); + } +#endif + goto nft_ctx_free_out; + + ret = 0; + +nft_ctx_free_out: + nft_ctx_free(nft); +buf_free_out: + xfree(buf); +image_close_out: + close_image(img); + + return ret; +} +#endif + int read_net_ns_img(void) { struct ns_id *ns; + if (!(root_ns_mask & CLONE_NEWNET)) + return 0; + for (ns = ns_ids; ns != NULL; ns = ns->next) { struct cr_img *img; int ret; @@ -2119,6 +2354,12 @@ static int restore_netns_conf(struct ns_id *ns) ret = ipv6_conf_op("default", (netns)->def_conf6, (netns)->n_def_conf6, CTL_WRITE, NULL); } + if ((netns)->unix_conf) { + ret = unix_conf_op(&(netns)->unix_conf, &(netns)->n_unix_conf, CTL_WRITE); + if (ret) + goto out; + } + ns->net.netns = netns; out: return ret; @@ -2130,6 +2371,11 @@ static int mount_ns_sysfs(void) BUG_ON(ns_sysfs_fd != -1); + if (kdat.has_fsopen) { + ns_sysfs_fd = mount_detached_fs("sysfs"); + return ns_sysfs_fd >= 0 ? 0 : -1; + } + /* * A new mntns is required to avoid the race between * open_detach_mount and creating mntns. 
@@ -2270,6 +2516,10 @@ int dump_net_ns(struct ns_id *ns) ret = dump_rule(fds); if (!ret) ret = dump_iptables(fds); +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (!ret) + ret = dump_nftables(fds); +#endif if (!ret) ret = dump_netns_conf(ns, fds); } else if (ns->type != NS_ROOT) { @@ -2363,6 +2613,10 @@ static int prepare_net_ns_second_stage(struct ns_id *ns) ret = restore_rule(nsid); if (!ret) ret = restore_iptables(nsid); +#if defined(CONFIG_HAS_NFTABLES_LIB_API_0) || defined(CONFIG_HAS_NFTABLES_LIB_API_1) + if (!ret) + ret = restore_nftables(nsid); +#endif } if (!ret) @@ -2590,7 +2844,7 @@ static int iptables_restore(bool ipv6, char *buf, int size) return ret; } -int network_lock_internal() +int network_lock_internal(void) { char conf[] = "*filter\n" ":CRIU - [0:0]\n" @@ -2621,7 +2875,7 @@ int network_lock_internal() return ret; } -static int network_unlock_internal() +static int network_unlock_internal(void) { char conf[] = "*filter\n" ":CRIU - [0:0]\n" @@ -2707,6 +2961,9 @@ int macvlan_ext_add(struct external *ext) static int prep_ns_sockets(struct ns_id *ns, bool for_dump) { int nsret = -1, ret; +#ifdef CONFIG_HAS_SELINUX + security_context_t ctx; +#endif if (ns->type != NS_CRIU) { pr_info("Switching to %d's net for collecting sockets\n", ns->ns_pid); @@ -2744,7 +3001,6 @@ static int prep_ns_sockets(struct ns_id *ns, bool for_dump) * policies installed. For Fedora based systems this is part * of the container-selinux package. 
*/ - security_context_t ctx; /* * This assumes that all processes CRIU wants to dump are labeled @@ -3172,7 +3428,7 @@ static int check_link_nsid(int rtsk, void *args) return do_rtnl_req(rtsk, &req, sizeof(req), check_one_link_nsid, NULL, NULL, args); } -int kerndat_link_nsid() +int kerndat_link_nsid(void) { int status; pid_t pid; @@ -3184,6 +3440,7 @@ int kerndat_link_nsid() } if (pid == 0) { + bool has_link_nsid; NetDeviceEntry nde = NET_DEVICE_ENTRY__INIT; struct net_link link = { .created = false, @@ -3226,7 +3483,7 @@ int kerndat_link_nsid() exit(1); } - bool has_link_nsid = false; + has_link_nsid = false; if (check_link_nsid(sk, &has_link_nsid)) exit(1); diff --git a/criu/page-pipe.c b/criu/page-pipe.c index a8216962da..439c180e4f 100644 --- a/criu/page-pipe.c +++ b/criu/page-pipe.c @@ -54,8 +54,12 @@ static inline int ppb_resize_pipe(struct page_pipe_buf *ppb) if (ppb->pages_in + ppb->pipe_off < ppb->pipe_size) return 0; - if (new_size > PIPE_MAX_SIZE) - return 1; + if (new_size > PIPE_MAX_SIZE) { + if (ppb->pipe_size < PIPE_MAX_SIZE) + ppb->pipe_size = PIPE_MAX_SIZE; + else + return 1; + } ret = __ppb_resize_pipe(ppb, new_size); if (ret < 0) diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 75e135c662..c9b4f2fbc1 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -6,6 +6,7 @@ #include #include #include +#include #undef LOG_PREFIX #define LOG_PREFIX "page-xfer: " @@ -24,6 +25,7 @@ #include "parasite-syscall.h" #include "rst_info.h" #include "stats.h" +#include "img-remote.h" #include "tls.h" static int page_server_sk = -1; @@ -381,13 +383,29 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo int pfd; int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM; - pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); - if (pfd < 0 && errno == ENOENT) - goto out; + + if (opts.remote) { + /* Note: we are replacing a real directory FD for a snapshot_id + * index. 
Since we need the parent of the current snapshot_id, + * we want the current snapshot_id index minus one. It is + * possible that dfd is already a snapshot_id index. We test it + * by comparing it to the service FD. When opening an image (see + * do_open_image) we convert the snapshot_id index into a real + * snapshot_id. + */ + pfd = get_curr_snapshot_id_idx() - 1; + if (pfd < 0) + goto out; + } else { + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); + if (pfd < 0 && errno == ENOENT) + goto out; + } xfer->parent = xmalloc(sizeof(*xfer->parent)); if (!xfer->parent) { - close(pfd); + if (!opts.remote) + close(pfd); return -1; } @@ -396,10 +414,12 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo pr_perror("No parent image found, though parent directory is set"); xfree(xfer->parent); xfer->parent = NULL; - close(pfd); + if (!opts.remote) + close(pfd); goto out; } - close(pfd); + if (!opts.remote) + close(pfd); } out: @@ -480,6 +500,402 @@ static inline u32 ppb_xfer_flags(struct page_xfer *xfer, struct page_pipe_buf *p return PE_PRESENT; } +/* + * Optimized pre-dump algorithm + * ============================== + * + * Note: Please refer man(2) page of process_vm_readv syscall. + * + * The following discussion covers the possibly faulty-iov + * locations in an iovec, which hinders process_vm_readv from + * dumping the entire iovec in a single invocation. + * + * Memory layout of target process: + * + * Pages: A B C + * +--------+--------+--------+--------+--------+--------+ + * ||||||||||||||||||||||||||||||||||||||||||||||||||||||| + * +--------+--------+--------+--------+--------+--------+ + * + * Single "iov" representation: {starting_address, length_in_bytes} + * An iovec is array of iov-s. + * + * NOTE: For easy representation and discussion purpose, we carry + * out further discussion at "page granularity". + * length_in_bytes will represent page count in iov instead + * of byte count. 
The same assumption applies to the syscall's + * return value: instead of returning the number of bytes + * read, it returns a page count. + * + * For the above memory mapping, the generated iovec is: {A,1}{B,1}{C,4} + * + * This iovec remains unmodified once generated. At the same + * time, some of the memory regions listed in the iovec may get modified + * (unmapped or protection changed) by the target process while the syscall + * is trying to dump the iovec regions. + * + * Case 1: + * A is unmapped, {A,1} becomes a faulty iov + * + * A B C + * +--------+--------+--------+--------+--------+--------+ + * | |||||||||||||||||||||||||||||||||||||||||||||| + * +--------+--------+--------+--------+--------+--------+ + * ^ ^ + * | | + * start | + * (1) | + * start + * (2) + * + * process_vm_readv will return -1. Increment the start pointer(2); + * the syscall will then process {B,1}{C,4} in one go and copy 5 pages + * to userbuf from iov-B and iov-C. + * + * Case 2: + * B is unmapped, {B,1} becomes a faulty iov + * + * A B C + * +--------+--------+--------+--------+--------+--------+ + * ||||||||| ||||||||||||||||||||||||||||||||||||| + * +--------+--------+--------+--------+--------+--------+ + * ^ ^ + * | | + * start | + * (1) | + * start + * (2) + * + * process_vm_readv will return 1, i.e. page A was copied to + * userbuf successfully and the syscall stopped, since B got + * unmapped. + * + * Increment the start pointer to C(2) and invoke the syscall again. + * Userbuf contains 5 pages overall from iov-A and iov-C. + * + * Case 3: + * This case deals with partial unmapping of an iov representing + * a region larger than one page. + * + * The syscall can't process such a faulty iov as a whole. So we + * process such regions part-by-part and form new sub-iovs + * in aux_iov from successfully processed pages.
+ * + * + * Part 3.1: + * First page of C is unmapped + * + * A B C + * +--------+--------+--------+--------+--------+--------+ + * |||||||||||||||||| |||||||||||||||||||||||||||| + * +--------+--------+--------+--------+--------+--------+ + * ^ ^ + * | | + * start | + * (1) | + * dummy + * (2) + * + * process_vm_readv will return 2, i.e. pages A and B copied. + * We identify that the length of iov-C is more than 1 page; that + * is where this case differs from Case 2. + * + * dummy-iov is introduced(2) as: {C+1,3}. dummy-iov can be + * placed directly at the page following the failing page. This will copy + * the remaining 3 pages from iov-C to userbuf. Finally, create the + * modified iov entry in aux_iov. The complete aux_iov looks like: + * + * aux_iov: {A,1}{B,1}{C+1,3}* + * + * + * Part 3.2: + * A middle page of C is unmapped, let's say the third + * + * A B C + * +--------+--------+--------+--------+--------+--------+ + * |||||||||||||||||||||||||||||||||||| |||||||||| + * +--------+--------+--------+--------+--------+--------+ + * ^ ^ + * | |-----------------| | + * start partial_read_bytes | + * (1) | + * dummy + * (2) + * + * process_vm_readv will return 4, i.e. pages A and B copied + * completely and first two pages of C are also copied. + * + * Since iov-C is not processed completely, we need to find the + * "partial_read_bytes" count to place our dummy-iov for the + * remaining processing of iov-C. This is done by the + * analyze_iov function. + * + * dummy-iov will be(2): {C+3,1}. dummy-iov will be placed + * right after the first failing address to process the rest of iov-C.
+ * New entries in aux_iov will look like: + * + * aux_iov: {A,1}{B,1}{C,2}*{C+3,1}* + */ + +unsigned long handle_faulty_iov(int pid, struct iovec* riov, + unsigned long faulty_index, + struct iovec *bufvec, struct iovec* aux_iov, + unsigned long* aux_len, + unsigned long partial_read_bytes) +{ + struct iovec dummy; + ssize_t bytes_read; + unsigned long offset = 0; + unsigned long final_read_cnt = 0; + + /* Handling Case 2*/ + if (riov[faulty_index].iov_len == PAGE_SIZE) { + cnt_sub(CNT_PAGES_WRITTEN, 1); + return 0; + } + + /* Handling Case 3-Part 3.2*/ + offset = (partial_read_bytes)? partial_read_bytes : PAGE_SIZE; + + dummy.iov_base = riov[faulty_index].iov_base + offset; + dummy.iov_len = riov[faulty_index].iov_len - offset; + + if (!partial_read_bytes) + cnt_sub(CNT_PAGES_WRITTEN, 1); + + while (dummy.iov_len) { + + bytes_read = process_vm_readv(pid, bufvec, 1, &dummy, 1, 0); + + if(bytes_read == -1) { + /* Handling faulty page read in faulty iov */ + cnt_sub(CNT_PAGES_WRITTEN, 1); + dummy.iov_base += PAGE_SIZE; + dummy.iov_len -= PAGE_SIZE; + continue; + } + + /* If aux-iov can merge and expand or new entry required */ + if (aux_iov[(*aux_len)-1].iov_base + + aux_iov[(*aux_len)-1].iov_len == dummy.iov_base) + aux_iov[(*aux_len)-1].iov_len += bytes_read; + else { + aux_iov[*aux_len].iov_base = dummy.iov_base; + aux_iov[*aux_len].iov_len = bytes_read; + (*aux_len) += 1; + } + + dummy.iov_base += bytes_read; + dummy.iov_len -= bytes_read; + bufvec->iov_base += bytes_read; + bufvec->iov_len -= bytes_read; + final_read_cnt += bytes_read; + } + + return final_read_cnt; +} + +/* + * This function will position start pointer to the latest + * successfully read iov in iovec. In case of partial read it + * returns partial_read_bytes, otherwise 0. 
+ */ +static unsigned long analyze_iov(ssize_t bytes_read, struct iovec* riov, + unsigned long *index, struct iovec *aux_iov, + unsigned long *aux_len) +{ + ssize_t processed_bytes = 0; + unsigned long partial_read_bytes = 0; + + /* correlating iovs with read bytes */ + while (processed_bytes < bytes_read) { + + processed_bytes += riov[*index].iov_len; + aux_iov[*aux_len].iov_base = riov[*index].iov_base; + aux_iov[*aux_len].iov_len = riov[*index].iov_len; + + (*aux_len) += 1; + (*index) += 1; + } + + /* handling partially processed faulty iov*/ + if (processed_bytes - bytes_read) { + + (*index) -= 1; + + partial_read_bytes = riov[*index].iov_len + - (processed_bytes - bytes_read); + aux_iov[*aux_len-1].iov_len = partial_read_bytes; + } + + return partial_read_bytes; +} + +/* + * This function iterates over complete ppb->iov entries and pass + * them to process_vm_readv syscall. + * + * Since process_vm_readv returns count of successfully read bytes. + * It does not point to iovec entry associated to last successful + * byte read. The correlation between bytes read and corresponding + * iovec is setup through analyze_iov function. + * + * If all iovecs are not processed in one go, it means there exists + * some faulty iov entry(memory mapping modified after it was grabbed) + * in iovec. process_vm_readv syscall stops at such faulty iov and + * skip processing further any entry in iovec. This is handled by + * handle_faulty_iov function. 
+ */ +static long fill_userbuf(int pid, struct page_pipe_buf *ppb, + struct iovec *bufvec, + struct iovec* aux_iov, + unsigned long *aux_len) +{ + struct iovec *riov = ppb->iov; + ssize_t bytes_read; + unsigned long total_read = 0; + unsigned long start = 0; + unsigned long partial_read_bytes = 0; + + while (start < ppb->nr_segs) { + + bytes_read = process_vm_readv(pid, bufvec, 1, &riov[start], + ppb->nr_segs - start, 0); + + if (bytes_read == -1) { + /* Handling Case 1*/ + if (riov[start].iov_len == PAGE_SIZE) { + cnt_sub(CNT_PAGES_WRITTEN, 1); + start += 1; + continue; + } else if (errno == ESRCH) { + pr_debug("Target process PID:%d not found\n", pid); + return ESRCH; + } + } + + partial_read_bytes = 0; + + if (bytes_read > 0) { + partial_read_bytes = analyze_iov(bytes_read, riov, + &start, aux_iov, + aux_len); + bufvec->iov_base += bytes_read; + bufvec->iov_len -= bytes_read; + total_read += bytes_read; + } + + /* + * If all iovs not processed in one go, + * it means some iov in between has failed. + */ + if (start < ppb->nr_segs) + total_read += handle_faulty_iov(pid, riov, start, bufvec, + aux_iov, aux_len, + partial_read_bytes); + + start += 1; + } + + return total_read; +} + +/* + * This function is similar to page_xfer_dump_pages, instead it uses + * auxiliary_iov array for pagemap generation. + * + * The entries of ppb->iov may mismatch with actual process mappings + * present at time of pre-dump. Such entries need to be adjusted as per + * the pages read by process_vm_readv syscall. These adjusted entries + * along with unmodified entries are present in aux_iov array. 
+ */ + +int page_xfer_predump_pages(int pid, struct page_xfer *xfer, + struct page_pipe *pp) +{ + struct page_pipe_buf *ppb; + unsigned int cur_hole = 0, i; + unsigned long ret, bytes_read; + struct iovec bufvec; + + struct iovec aux_iov[PIPE_MAX_SIZE]; + unsigned long aux_len; + + char *userbuf = mmap(NULL, BUFFER_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (userbuf == MAP_FAILED) { + pr_perror("Unable to mmap a buffer"); + return -1; + } + + list_for_each_entry(ppb, &pp->bufs, l) { + + timing_start(TIME_MEMDUMP); + + aux_len = 0; + bufvec.iov_len = BUFFER_SIZE; + bufvec.iov_base = userbuf; + + bytes_read = fill_userbuf(pid, ppb, &bufvec, aux_iov, &aux_len); + + if (bytes_read == ESRCH) { + munmap(userbuf, BUFFER_SIZE); + return -1; + } + + bufvec.iov_base = userbuf; + bufvec.iov_len = bytes_read; + ret = vmsplice(ppb->p[1], &bufvec, 1, SPLICE_F_NONBLOCK); + + if (ret == -1 || ret != bytes_read) { + pr_err("vmsplice: Failed to splice user buffer to pipe %ld\n", ret); + munmap(userbuf, BUFFER_SIZE); + return -1; + } + + timing_stop(TIME_MEMDUMP); + timing_start(TIME_MEMWRITE); + + /* generating pagemap */ + for (i = 0; i < aux_len; i++) { + + struct iovec iov = aux_iov[i]; + u32 flags; + + ret = dump_holes(xfer, pp, &cur_hole, iov.iov_base); + if (ret) { + munmap(userbuf, BUFFER_SIZE); + return ret; + } + + BUG_ON(iov.iov_base < (void *)xfer->offset); + iov.iov_base -= xfer->offset; + pr_debug("\t p %p [%u]\n", iov.iov_base, + (unsigned int)(iov.iov_len / PAGE_SIZE)); + + flags = ppb_xfer_flags(xfer, ppb); + + if (xfer->write_pagemap(xfer, &iov, flags)) { + munmap(userbuf, BUFFER_SIZE); + return -1; + } + + if (xfer->write_pages(xfer, ppb->p[0], iov.iov_len)) { + munmap(userbuf, BUFFER_SIZE); + return -1; + } + } + + timing_stop(TIME_MEMWRITE); + } + + munmap(userbuf, BUFFER_SIZE); + timing_start(TIME_MEMWRITE); + + return dump_holes(xfer, pp, &cur_hole, NULL); +} + int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe 
*pp) { struct page_pipe_buf *ppb; @@ -531,6 +947,9 @@ int check_parent_local_xfer(int fd_type, unsigned long img_id) struct stat st; int ret, pfd; + if (opts.remote) + return get_curr_parent_snapshot_id_idx() == -1 ? 0 : 1; + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) return 0; @@ -1030,7 +1449,7 @@ int cr_page_server(bool daemon_mode, bool lazy_dump, int cfd) goto no_server; } - sk = setup_tcp_server("page", opts.addr, &opts.port); + sk = setup_tcp_server("page"); if (sk == -1) return -1; no_server: @@ -1076,7 +1495,7 @@ static int connect_to_page_server(void) goto out; } - page_server_sk = setup_tcp_client(opts.addr); + page_server_sk = setup_tcp_client(); if (page_server_sk == -1) return -1; diff --git a/criu/pagemap.c b/criu/pagemap.c index 05f6b82b8e..3c3930b7b5 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -18,6 +18,7 @@ #include "xmalloc.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" +#include "img-remote.h" #ifndef SEEK_DATA #define SEEK_DATA 3 @@ -143,8 +144,12 @@ static void skip_pagemap_pages(struct page_read *pr, unsigned long len) if (!len) return; - if (pagemap_present(pr->pe)) + if (pagemap_present(pr->pe)) { + if (opts.remote) + if (skip_remote_bytes(img_raw_fd(pr->pi), len)) + pr_perror("Error skipping remote bytes"); pr->pi_off += len; + } pr->cvaddr += len; } @@ -161,7 +166,7 @@ static int seek_pagemap(struct page_read *pr, unsigned long vaddr) break; if (vaddr >= start && vaddr < end) { - skip_pagemap_pages(pr, vaddr - pr->cvaddr); + skip_pagemap_pages(pr, vaddr > pr->cvaddr ? 
vaddr - pr->cvaddr : 0); return 1; } @@ -244,7 +249,6 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, { int fd = img_raw_fd(pr->pi); ssize_t ret; - size_t curr = 0; /* * Flush any pending async requests if any not to break the @@ -254,15 +258,10 @@ static int read_local_page(struct page_read *pr, unsigned long vaddr, return -1; pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n", pr->img_id, pr->id, pr->cvaddr, pr->pi_off); - while (1) { - ret = pread(fd, buf + curr, len - curr, pr->pi_off + curr); - if (ret < 1) { - pr_perror("Can't read mapping page %zd", ret); - return -1; - } - curr += ret; - if (curr == len) - break; + ret = pread(fd, buf, len, pr->pi_off); + if (ret != len) { + pr_perror("Can't read mapping page %zd", ret); + return -1; } if (opts.auto_dedup) { @@ -406,6 +405,37 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, return ret; } +static int maybe_read_page_img_cache(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + unsigned long len = nr * PAGE_SIZE; + int fd = img_raw_fd(pr->pi); + int ret; + size_t curr = 0; + + pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n", pr->img_id, pr->id, pr->cvaddr, pr->pi_off); + while (1) { + ret = read(fd, buf + curr, len - curr); + if (ret < 0) { + pr_perror("Can't read mapping page %d", ret); + return -1; + } + curr += ret; + if (curr == len) + break; + } + + if (opts.auto_dedup) + pr_warn_once("Can't dedup from image cache\n"); + + if (ret == 0 && pr->io_complete) + ret = pr->io_complete(pr, vaddr, nr); + + pr->pi_off += len; + + return ret; +} + static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) { int ret = 0; @@ -419,6 +449,8 @@ static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_ if (pr->io_complete) ret = pr->io_complete(pr, vaddr, nr_pages); + else + pr_warn_once("Remote page read w/o io_complete!\n"); return ret; } @@ -601,9 
+633,24 @@ static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int int pfd, ret; struct page_read *parent = NULL; - pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); - if (pfd < 0 && errno == ENOENT) - goto out; + if (opts.remote) { + /* Note: we are replacing a real directory FD for a snapshot_id + * index. Since we need the parent of the current snapshot_id, + * we want the current snapshot_id index minus one. It is + * possible that dfd is already a snapshot_id index. We test it + * by comparing it to the service FD. When opening an image (see + * do_open_image) we convert the snapshot_id index into a real + * snapshot_id. + */ + pfd = dfd == get_service_fd(IMG_FD_OFF) ? + get_curr_snapshot_id_idx() - 1 : dfd - 1; + if (pfd < 0) + goto out; + } else { + pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); + if (pfd < 0 && errno == ENOENT) + goto out; + } parent = xmalloc(sizeof(*parent)); if (!parent) @@ -618,7 +665,8 @@ static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int parent = NULL; } - close(pfd); + if (!opts.remote) + close(pfd); out: pr->parent = parent; return 0; @@ -626,7 +674,8 @@ static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int err_free: xfree(parent); err_cl: - close(pfd); + if (!opts.remote) + close(pfd); return -1; } @@ -657,7 +706,18 @@ static int init_pagemaps(struct page_read *pr) off_t fsize; int nr_pmes, nr_realloc; - fsize = img_raw_size(pr->pmi); + if (!opts.remote) + fsize = img_raw_size(pr->pmi); + else + /* + * FIXME - There is no easy way to estimate the size of the + * pagemap that is still to be read from the socket. Possible + * solution is to ask Image Proxy or Image Cache about the size + * of the image. 1024 is a wild guess (more space is allocated + * if needed). 
+ */ + fsize = 1024; + if (fsize < 0) return -1; @@ -779,7 +839,9 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p pr->id = ids++; pr->img_id = img_id; - if (remote) + if (opts.remote) + pr->maybe_read_page = maybe_read_page_img_cache; + else if (remote) pr->maybe_read_page = maybe_read_page_remote; else { pr->maybe_read_page = maybe_read_page_local; diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c index b9788a4c28..b649d1b51a 100644 --- a/criu/parasite-syscall.c +++ b/criu/parasite-syscall.c @@ -45,8 +45,6 @@ #include "infect-rpc.h" #include "pie/parasite-blob.h" -#include - unsigned long get_exec_start(struct vm_area_list *vmas) { struct vma_area *vma_area; @@ -565,7 +563,8 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, parasite_ensure_args_size(aio_rings_args_size(vma_area_list)); if (compel_infect(ctl, item->nr_threads, parasite_args_size) < 0) { - compel_cure(ctl); + if (compel_cure(ctl)) + pr_warn("Can't cure failed infection\n"); return NULL; } diff --git a/criu/pie/Makefile b/criu/pie/Makefile index 1ad456f430..a30747ac30 100644 --- a/criu/pie/Makefile +++ b/criu/pie/Makefile @@ -14,7 +14,7 @@ ifneq ($(filter-out clean mrproper,$(MAKECMDGOALS)),) compel_plugins := $(shell $(COMPEL_BIN) plugins) endif -LDS := compel/arch/$(SRCARCH)/scripts/compel-pack.lds.S +LDS := compel/arch/$(ARCH)/scripts/compel-pack.lds.S restorer-obj-y += parasite-vdso.o ./$(ARCH_DIR)/vdso-pie.o restorer-obj-y += ./$(ARCH_DIR)/restorer.o @@ -26,11 +26,11 @@ ifeq ($(ARCH),x86) endif endif -ifeq ($(SRCARCH),aarch64) +ifeq ($(ARCH),aarch64) restorer-obj-y += ./$(ARCH_DIR)/intraprocedure.o endif -ifeq ($(SRCARCH),ppc64) +ifeq ($(ARCH),ppc64) restorer-obj-y += ./$(ARCH_DIR)/vdso-trampoline.o endif diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library index 658c8a4eb8..de75b11d46 100644 --- a/criu/pie/Makefile.library +++ b/criu/pie/Makefile.library @@ -9,14 +9,14 @@ lib-name := pie.lib.a lib-y 
+= util.o lib-y += util-vdso.o -ifeq ($(SRCARCH),x86) +ifeq ($(ARCH),x86) ifeq ($(CONFIG_COMPAT),y) lib-y += util-vdso-elf32.o endif CFLAGS_util-vdso-elf32.o += -DCONFIG_VDSO_32 endif -ifeq ($(SRCARCH),arm) +ifeq ($(ARCH),arm) lib-y += ./$(ARCH_DIR)/aeabi-helpers.o lib-y += ./$(ARCH_DIR)/pie-cacheflush.o endif diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c index 38da766804..3f5cb14312 100644 --- a/criu/pie/parasite-vdso.c +++ b/criu/pie/parasite-vdso.c @@ -119,9 +119,9 @@ int vdso_do_park(struct vdso_maps *rt, unsigned long addr, unsigned long space) BUG_ON((vdso_size + vvar_size) < space); if (rt->sym.vdso_before_vvar) - return park_at(rt, addr, addr + vvar_size); + return park_at(rt, addr, addr + vdso_size); else - return park_at(rt, addr + vdso_size, addr); + return park_at(rt, addr + vvar_size, addr); } #ifndef CONFIG_COMPAT @@ -292,6 +292,18 @@ int vdso_proxify(struct vdso_maps *rt, bool *added_proxy, return -1; } + /* + * We could still do something about it here.. + * 1. Hope that vDSO from images still works (might not be the case). + * 2. Try to map vDSO. + * But, hopefully no one intends to migrate application that uses + * vDSO to a dut where kernel doesn't provide it. + */ + if (!vdso_is_present(rt)) { + pr_err("vDSO isn't provided by kernel, but exists in images\n"); + return -1; + } + /* * vDSO mark overwrites Elf program header of proxy vDSO thus * it must never ever be greater in size. diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c index 387a976da0..64b5bbb3e2 100644 --- a/criu/pie/parasite.c +++ b/criu/pie/parasite.c @@ -317,15 +317,60 @@ static int dump_creds(struct parasite_dump_creds *args) return -1; } +static int fill_fds_fown(int fd, struct fd_opts *p) +{ + int flags, ret; + struct f_owner_ex owner_ex; + uint32_t v[2]; + + /* + * For O_PATH opened files there is no owner at all. 
+ */ + flags = sys_fcntl(fd, F_GETFL, 0); + if (flags < 0) { + pr_err("fcntl(%d, F_GETFL) -> %d\n", fd, flags); + return -1; + } + if (flags & O_PATH) { + p->fown.pid = 0; + return 0; + } + + ret = sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex); + if (ret) { + pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret); + return -1; + } + + /* + * Simple case -- nothing is changed. + */ + if (owner_ex.pid == 0) { + p->fown.pid = 0; + return 0; + } + + ret = sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v); + if (ret) { + pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret); + return -1; + } + + p->fown.uid = v[0]; + p->fown.euid = v[1]; + p->fown.pid_type = owner_ex.type; + p->fown.pid = owner_ex.pid; + + return 0; +} + static int fill_fds_opts(struct parasite_drain_fd *fds, struct fd_opts *opts) { int i; for (i = 0; i < fds->nr_fds; i++) { - int flags, fd = fds->fds[i], ret; + int flags, fd = fds->fds[i]; struct fd_opts *p = opts + i; - struct f_owner_ex owner_ex; - uint32_t v[2]; flags = sys_fcntl(fd, F_GETFD, 0); if (flags < 0) { @@ -335,30 +380,8 @@ static int fill_fds_opts(struct parasite_drain_fd *fds, struct fd_opts *opts) p->flags = (char)flags; - ret = sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex); - if (ret) { - pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret); - return -1; - } - - /* - * Simple case -- nothing is changed. 
- */ - if (owner_ex.pid == 0) { - p->fown.pid = 0; - continue; - } - - ret = sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v); - if (ret) { - pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret); + if (fill_fds_fown(fd, p)) return -1; - } - - p->fown.uid = v[0]; - p->fown.euid = v[1]; - p->fown.pid_type = owner_ex.type; - p->fown.pid = owner_ex.pid; } return 0; diff --git a/criu/pie/pie-relocs.h b/criu/pie/pie-relocs.h index 6797486c2d..e36126be60 100644 --- a/criu/pie/pie-relocs.h +++ b/criu/pie/pie-relocs.h @@ -1,8 +1,6 @@ #ifndef __PIE_RELOCS_H__ #define __PIE_RELOCS_H__ -#include - #include "common/config.h" #include "common/compiler.h" diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c index 390c0e1a9a..afe185f048 100644 --- a/criu/pie/restorer.c +++ b/criu/pie/restorer.c @@ -35,6 +35,7 @@ #include "sk-inet.h" #include "vma.h" #include "uffd.h" +#include "sched.h" #include "common/lock.h" #include "common/page.h" @@ -1320,21 +1321,23 @@ static int fd_poll(int inotify_fd) } /* - * note: Actually kernel may want even more space for one event (see - * round_event_name_len), so using buffer of EVENT_BUFF_SIZE size may fail. - * To be on the safe side - take a bigger buffer, and these also allows to - * read more events in one syscall. + * In the worst case buf size should be: + * sizeof(struct inotify_event) * 2 + PATH_MAX + * See round_event_name_len() in kernel. 
*/ -#define EVENT_BUFF_SIZE ((sizeof(struct inotify_event) + PATH_MAX)) +#define EVENT_BUFF_SIZE ((sizeof(struct inotify_event) * 2 + PATH_MAX)) /* * Read all available events from inotify queue */ static int cleanup_inotify_events(int inotify_fd) { - char buf[EVENT_BUFF_SIZE * 8]; + char buf[EVENT_BUFF_SIZE * 3]; int ret; + /* Limit buf to be lesser than half of restorer's stack */ + BUILD_BUG_ON(ARRAY_SIZE(buf) >= RESTORE_STACK_SIZE/2); + while (1) { ret = fd_poll(inotify_fd); if (ret < 0) { @@ -1451,7 +1454,7 @@ long __export_restore_task(struct task_restore_args *args) * it's presence in original task: vdso will be used for fast * getttimeofday() in restorer's log timings. */ - if (!args->can_map_vdso) { + if (!args->can_map_vdso && vdso_is_present(&args->vdso_maps_rt)) { /* It's already checked in kdat, but let's check again */ if (args->compatible_mode) { pr_err("Compatible mode without vdso map support\n"); @@ -1769,16 +1772,19 @@ long __export_restore_task(struct task_restore_args *args) long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS; long last_pid_len; + pid_t thread_pid; long parent_tid; int i, fd = -1; - /* One level pid ns hierarhy */ - fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); - if (fd < 0) { - pr_err("can't open last pid fd %d\n", fd); - goto core_restore_end; - } + if (!args->has_clone3_set_tid) { + /* One level pid ns hierarhy */ + fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); + if (fd < 0) { + pr_err("can't open last pid fd %d\n", fd); + goto core_restore_end; + } + } mutex_lock(&task_entries_local->last_pid_mutex); for (i = 0; i < args->nr_threads; i++) { @@ -1789,24 +1795,38 @@ long __export_restore_task(struct task_restore_args *args) continue; new_sp = restorer_stack(thread_args[i].mz); - last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); - sys_lseek(fd, 0, SEEK_SET); - ret = sys_write(fd, s, last_pid_len); - if (ret < 0) 
{ - pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf); - sys_close(fd); - mutex_unlock(&task_entries_local->last_pid_mutex); - goto core_restore_end; - } - - /* - * To achieve functionality like libc's clone() - * we need a pure assembly here, because clone()'ed - * thread will run with own stack and we must not - * have any additional instructions... oh, dear... - */ + if (args->has_clone3_set_tid) { + struct _clone_args c_args = {}; + thread_pid = thread_args[i].pid; + c_args.set_tid = ptr_to_u64(&thread_pid); + c_args.flags = clone_flags; + c_args.set_tid_size = 1; + /* The kernel does stack + stack_size. */ + c_args.stack = new_sp - RESTORE_STACK_SIZE; + c_args.stack_size = RESTORE_STACK_SIZE; + c_args.child_tid = ptr_to_u64(&thread_args[i].pid); + c_args.parent_tid = ptr_to_u64(&parent_tid); + pr_debug("Using clone3 to restore the process\n"); + RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn); + } else { + last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); + sys_lseek(fd, 0, SEEK_SET); + ret = sys_write(fd, s, last_pid_len); + if (ret < 0) { + pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf); + sys_close(fd); + mutex_unlock(&task_entries_local->last_pid_mutex); + goto core_restore_end; + } - RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); + /* + * To achieve functionality like libc's clone() + * we need a pure assembly here, because clone()'ed + * thread will run with own stack and we must not + * have any additional instructions... oh, dear... 
+ */ + RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); + } if (ret != thread_args[i].pid) { pr_err("Unable to create a thread: %ld\n", ret); mutex_unlock(&task_entries_local->last_pid_mutex); @@ -1837,9 +1857,6 @@ long __export_restore_task(struct task_restore_args *args) restore_finish_stage(task_entries_local, CR_STATE_RESTORE); - if (cleanup_current_inotify_events(args)) - goto core_restore_end; - if (wait_helpers(args) < 0) goto core_restore_end; if (wait_zombies(args) < 0) @@ -1852,6 +1869,9 @@ long __export_restore_task(struct task_restore_args *args) goto core_restore_end; } + if (cleanup_current_inotify_events(args)) + goto core_restore_end; + if (!args->compatible_mode) { ret = sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t)); diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c index 104da06332..58b27680c8 100644 --- a/criu/pie/util-vdso.c +++ b/criu/pie/util-vdso.c @@ -243,10 +243,11 @@ static void parse_elf_symbols(uintptr_t mem, size_t size, Phdr_t *load, k = elf_hash((const unsigned char *)symbol); for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) { - addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; Sym_t *sym; char *name; + addr = mem + dyn_symtab->d_un.d_ptr - load->p_vaddr; + addr += sizeof(Sym_t)*j; if (__ptr_struct_oob(addr, sizeof(Sym_t), mem, size)) continue; diff --git a/criu/pipes.c b/criu/pipes.c index fd1a7e6bb2..d74329161b 100644 --- a/criu/pipes.c +++ b/criu/pipes.c @@ -160,24 +160,24 @@ int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash return 0; } - if (!pd->pde->bytes) - goto out; - - if (!pd->data) { - pr_err("Double data restore occurred on %#x\n", id); - return -1; - } - if (pd->pde->has_size) { pr_info("Restoring size %#x for %#x\n", pd->pde->size, pd->pde->pipe_id); ret = fcntl(pfd, F_SETPIPE_SZ, pd->pde->size); if (ret < 0) { pr_perror("Can't restore pipe size"); - goto err; + return -1; } } + if 
(!pd->pde->bytes) + return 0; + + if (!pd->data) { + pr_err("Double data restore occurred on %#x\n", id); + return -1; + } + iov.iov_base = pd->data; iov.iov_len = pd->pde->bytes; @@ -185,14 +185,13 @@ int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK); if (ret < 0) { pr_perror("%#x: Error splicing data", id); - goto err; + return -1; } if (ret == 0 || ret > iov.iov_len /* sanity */) { pr_err("%#x: Wanted to restore %zu bytes, but got %d\n", id, iov.iov_len, ret); - ret = -1; - goto err; + return -1; } iov.iov_base += ret; @@ -211,10 +210,7 @@ int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash munmap(pd->data, pd->pde->bytes); pd->data = NULL; -out: - ret = 0; -err: - return ret; + return 0; } static int userns_reopen(void *_arg, int fd, pid_t pid) @@ -282,8 +278,8 @@ static char *pipe_d_name(struct file_desc *d, char *buf, size_t s) struct pipe_info *pi; pi = container_of(d, struct pipe_info, d); - if (snprintf(buf, s, "pipe:[%d]", pi->pe->pipe_id) >= s) { - pr_err("Not enough room for pipe %d identifier string\n", + if (snprintf(buf, s, "pipe:[%u]", pi->pe->pipe_id) >= s) { + pr_err("Not enough room for pipe %u identifier string\n", pi->pe->pipe_id); return NULL; } diff --git a/criu/proc_parse.c b/criu/proc_parse.c index 0e8b6f209f..60aba87887 100644 --- a/criu/proc_parse.c +++ b/criu/proc_parse.c @@ -41,6 +41,7 @@ #include "timerfd.h" #include "path.h" #include "fault-injection.h" +#include "memfd.h" #include "protobuf.h" #include "images/fdinfo.pb-c.h" @@ -303,6 +304,26 @@ static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, } vfi_dev = makedev(vfi->dev_maj, vfi->dev_min); + + if (is_memfd(vfi_dev)) { + struct fd_link link; + link.len = strlen(fname); + strlcpy(link.name, fname, sizeof(link.name)); + strip_deleted(&link); + + /* + * The error EPERM will be shown in the following pr_perror(). 
+ * It comes from the previous open() call. + */ + pr_perror("Can't open mapped [%s]", link.name); + + /* + * TODO Perhaps we could do better than failing and dump the + * memory like what is being done in shmem.c + */ + return -1; + } + if (is_anon_shmem_map(vfi_dev)) { if (!(vma->e->flags & MAP_SHARED)) return -1; @@ -563,6 +584,14 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, vma_area->e->shmid = prev->e->shmid; vma_area->vmst = prev->vmst; vma_area->mnt_id = prev->mnt_id; + + if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) { + vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED); + if (vma_area->e->flags & MAP_PRIVATE) + vma_area->e->status |= VMA_FILE_PRIVATE; + else + vma_area->e->status |= VMA_FILE_SHARED; + } } else if (*vm_file_fd >= 0) { struct stat *st_buf = vma_area->vmst; @@ -575,25 +604,21 @@ static int handle_vma(pid_t pid, struct vma_area *vma_area, goto err; } - /* - * /dev/zero stands for anon-shared mapping - * otherwise it's some file mapping. 
- */ - if (is_anon_shmem_map(st_buf->st_dev)) { - if (!(vma_area->e->flags & MAP_SHARED)) - goto err_bogus_mapping; + if (is_anon_shmem_map(st_buf->st_dev) && !strncmp(file_path, "/SYSV", 5)) { vma_area->e->flags |= MAP_ANONYMOUS; vma_area->e->status |= VMA_ANON_SHARED; vma_area->e->shmid = st_buf->st_ino; - - if (!strncmp(file_path, "/SYSV", 5)) { - pr_info("path: %s\n", file_path); - vma_area->e->status |= VMA_AREA_SYSVIPC; - } else { + if (!(vma_area->e->flags & MAP_SHARED)) + goto err_bogus_mapping; + pr_info("path: %s\n", file_path); + vma_area->e->status |= VMA_AREA_SYSVIPC; + } else { + if (is_anon_shmem_map(st_buf->st_dev)) { + vma_area->e->status |= VMA_AREA_MEMFD; if (fault_injected(FI_HUGE_ANON_SHMEM_ID)) vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE; } - } else { + if (vma_area->e->flags & MAP_PRIVATE) vma_area->e->status |= VMA_FILE_PRIVATE; else @@ -932,7 +957,7 @@ int prepare_loginuid(unsigned int value, unsigned int loglevel) if (write(fd, buf, 11) < 0) { print_on_level(loglevel, - "Write %s to /proc/self/loginuid failed: %s", + "Write %s to /proc/self/loginuid failed: %s\n", buf, strerror(errno)); ret = -1; } @@ -1669,17 +1694,27 @@ static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg) if (fdinfo_field(str, "lock")) { struct file_lock *fl; struct fdinfo_common *fdinfo = arg; + char *flock_status = str+sizeof("lock:\t")-1; if (type != FD_TYPES__UND) continue; + /* + * The lock status can be empty when the owner of the + * lock is invisible from our PID namespace. + * This unfortunate behavior is fixed in kernels v4.19 + * and up (see commit 1cf8e5de40). 
+ */ + if (flock_status[0] == '\0') + continue; + fl = alloc_file_lock(); if (!fl) { pr_perror("Alloc file lock failed!"); goto out; } - if (parse_file_lock_buf(str + 6, fl, 0)) { + if (parse_file_lock_buf(flock_status, fl, 0)) { xfree(fl); goto parse_err; } @@ -2488,6 +2523,12 @@ int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups) goto err; } *off = '\0'; + + if (cgp_should_skip_controller(controllers)) { + pr_debug("cg-prop: Skipping controller %s\n", controllers); + continue; + } + while (1) { off = strchr(controllers, ','); if (off) diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index 41c2080372..bfe00c561a 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -62,6 +62,7 @@ #include "images/seccomp.pb-c.h" #include "images/binfmt-misc.pb-c.h" #include "images/autofs.pb-c.h" +#include "images/remote-image.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/protobuf.c b/criu/protobuf.c index 8eb73e0198..e68d42b5ca 100644 --- a/criu/protobuf.c +++ b/criu/protobuf.c @@ -20,13 +20,6 @@ #include "protobuf.h" #include "util.h" -/* - * To speed up reading of packed objects - * by providing space on stack, this should - * be more than enough for most objects. 
- */ -#define PB_PKOBJ_LOCAL_SIZE 1024 - static char *image_name(struct cr_img *img) { int fd = img->_x.fd; diff --git a/criu/pstree.c b/criu/pstree.c index 92b4167aae..19cf5ad381 100644 --- a/criu/pstree.c +++ b/criu/pstree.c @@ -608,7 +608,7 @@ static int read_pstree_image(pid_t *pid_max) } #define RESERVED_PIDS 300 -static int get_free_pid() +static int get_free_pid(void) { static struct pid *prev, *next; diff --git a/criu/seize.c b/criu/seize.c index cce8911b92..f973806d99 100644 --- a/criu/seize.c +++ b/criu/seize.c @@ -20,9 +20,9 @@ #include "seccomp.h" #include "seize.h" #include "stats.h" +#include "string.h" #include "xmalloc.h" #include "util.h" -#include #define NR_ATTEMPTS 5 @@ -30,7 +30,17 @@ static const char frozen[] = "FROZEN"; static const char freezing[] = "FREEZING"; static const char thawed[] = "THAWED"; -static const char *get_freezer_state(int fd) +enum freezer_state { + FREEZER_ERROR = -1, + THAWED, + FROZEN, + FREEZING +}; + +/* Track if we are running on cgroup v2 system. */ +static bool cgroup_v2 = false; + +static enum freezer_state get_freezer_v1_state(int fd) { char state[32]; int ret; @@ -52,15 +62,79 @@ static const char *get_freezer_state(int fd) pr_debug("freezer.state=%s\n", state); if (strcmp(state, frozen) == 0) - return frozen; + return FROZEN; else if (strcmp(state, freezing) == 0) - return freezing; + return FREEZING; else if (strcmp(state, thawed) == 0) - return thawed; + return THAWED; pr_err("Unknown freezer state: %s\n", state); err: - return NULL; + return FREEZER_ERROR; +} + +static enum freezer_state get_freezer_v2_state(int fd) +{ + int exit_code = FREEZER_ERROR; + char path[PATH_MAX]; + FILE *event; + char state; + int ret; + + /* + * cgroupv2 freezer uses cgroup.freeze to control the state. The file + * can return 0 or 1. 1 means the cgroup is frozen; 0 means it is not + * frozen. Writing 1 to an unfrozen cgroup can freeze it. 
Freezing can + * take some time and if the cgroup has finished freezing can be + * seen in cgroup.events: frozen 0|1. + */ + + ret = lseek(fd, 0, SEEK_SET); + if (ret < 0) { + pr_perror("Unable to seek freezer FD"); + goto out; + } + ret = read(fd, &state, 1); + if (ret <= 0) { + pr_perror("Unable to read from freezer FD"); + goto out; + } + pr_debug("cgroup.freeze=%c\n", state); + if (state == '0') { + exit_code = THAWED; + goto out; + } + + snprintf(path, sizeof(path), "%s/cgroup.events", opts.freeze_cgroup); + event = fopen(path, "r"); + if (event == NULL) { + pr_perror("Unable to open %s", path); + goto out; + } + while (fgets(path, sizeof(path), event)) { + if (strncmp(path, "frozen", 6) != 0) { + continue; + } else if (strncmp(path, "frozen 0", 8) == 0) { + exit_code = FREEZING; + goto close; + } else if (strncmp(path, "frozen 1", 8) == 0) { + exit_code = FROZEN; + goto close; + } + } + + pr_err("Unknown freezer state: %c\n", state); +close: + fclose(event); +out: + return exit_code; +} + +static enum freezer_state get_freezer_state(int fd) +{ + if (cgroup_v2) + return get_freezer_v2_state(fd); + return get_freezer_v1_state(fd); } static bool freezer_thawed; @@ -70,35 +144,99 @@ const char *get_real_freezer_state(void) return freezer_thawed ? 
thawed : frozen; } -static int freezer_restore_state(void) +static int freezer_write_state(int fd, enum freezer_state new_state) { - int fd; - char path[PATH_MAX]; + char state[32] = {0}; + int ret; - if (!opts.freeze_cgroup || freezer_thawed) - return 0; + if (new_state == THAWED) { + if (cgroup_v2) + state[0] = '0'; + else + if (strlcpy(state, thawed, sizeof(state)) >= + sizeof(state)) + return -1; + } else if (new_state == FROZEN) { + if (cgroup_v2) + state[0] = '1'; + else + if (strlcpy(state, frozen, sizeof(state)) >= + sizeof(state)) + return -1; + } else { + return -1; + } - snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup); + ret = lseek(fd, 0, SEEK_SET); + if (ret < 0) { + pr_perror("Unable to seek freezer FD"); + return -1; + } + if (write(fd, state, sizeof(state)) != sizeof(state)) { + pr_perror("Unable to %s tasks", + (new_state == THAWED) ? "thaw" : "freeze"); + return -1; + } + + return 0; +} + +static int freezer_open(void) +{ + const char freezer_v1[] = "freezer.state"; + const char freezer_v2[] = "cgroup.freeze"; + char path[PATH_MAX]; + int fd; + + snprintf(path, sizeof(path), "%s/%s", opts.freeze_cgroup, + cgroup_v2 ? freezer_v2 : freezer_v1); fd = open(path, O_RDWR); if (fd < 0) { pr_perror("Unable to open %s", path); return -1; } - if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) { - pr_perror("Unable to freeze tasks"); - close(fd); + return fd; +} + +static int freezer_restore_state(void) +{ + int fd; + int ret; + + if (!opts.freeze_cgroup || freezer_thawed) + return 0; + + fd = freezer_open(); + if (fd < 0) return -1; - } + + ret = freezer_write_state(fd, FROZEN); close(fd); - return 0; + return ret; +} + +static FILE *freezer_open_thread_list(char *root_path) +{ + char path[PATH_MAX]; + FILE *f; + + snprintf(path, sizeof(path), "%s/%s", root_path, + cgroup_v2 ? 
"cgroup.threads" : "tasks"); + f = fopen(path, "r"); + if (f == NULL) { + pr_perror("Unable to open %s", path); + return NULL; + } + + return f; } /* A number of tasks in a freezer cgroup which are not going to be dumped */ static int processes_to_wait; static pid_t *processes_to_wait_pids; -static int seize_cgroup_tree(char *root_path, const char *state) +static int seize_cgroup_tree(char *root_path, enum freezer_state state) { DIR *dir; struct dirent *de; @@ -109,12 +247,10 @@ static int seize_cgroup_tree(char *root_path, const char *state) * New tasks can appear while a freezer state isn't * frozen, so we need to catch all new tasks. */ - snprintf(path, sizeof(path), "%s/tasks", root_path); - f = fopen(path, "r"); - if (f == NULL) { - pr_perror("Unable to open %s", path); + f = freezer_open_thread_list(root_path); + if (f == NULL) return -1; - } + while (fgets(path, sizeof(path), f)) { pid_t pid; int ret; @@ -134,7 +270,7 @@ static int seize_cgroup_tree(char *root_path, const char *state) if (!compel_interrupt_task(pid)) { pr_debug("SEIZE %d: success\n", pid); processes_to_wait++; - } else if (state == frozen) { + } else if (state == FROZEN) { char buf[] = "/proc/XXXXXXXXXX/exe"; struct stat st; @@ -194,7 +330,7 @@ static int seize_cgroup_tree(char *root_path, const char *state) * A freezer cgroup can contain tasks which will not be dumped * and we need to wait them, because the are interrupted them by ptrace. 
*/ -static int freezer_wait_processes() +static int freezer_wait_processes(void) { int i; @@ -261,12 +397,10 @@ static int log_unfrozen_stacks(char *root) char path[PATH_MAX]; FILE *f; - snprintf(path, sizeof(path), "%s/tasks", root); - f = fopen(path, "r"); - if (f == NULL) { - pr_perror("Unable to open %s", path); + f = freezer_open_thread_list(root); + if (f == NULL) return -1; - } + while (fgets(path, sizeof(path), f)) { pid_t pid; int ret, stack; @@ -331,8 +465,7 @@ static int log_unfrozen_stacks(char *root) static int freeze_processes(void) { int fd, exit_code = -1; - char path[PATH_MAX]; - const char *state = thawed; + enum freezer_state state = THAWED; static const unsigned long step_ms = 100; unsigned long nr_attempts = (opts.timeout * 1000000) / step_ms; @@ -354,23 +487,19 @@ static int freeze_processes(void) pr_debug("freezing processes: %lu attempts with %lu ms steps\n", nr_attempts, step_ms); - snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup); - fd = open(path, O_RDWR); - if (fd < 0) { - pr_perror("Unable to open %s", path); + fd = freezer_open(); + if (fd < 0) return -1; - } + state = get_freezer_state(fd); - if (!state) { + if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == thawed) { + if (state == THAWED) { freezer_thawed = true; - lseek(fd, 0, SEEK_SET); - if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) { - pr_perror("Unable to freeze tasks"); + if (freezer_write_state(fd, FROZEN)) { close(fd); return -1; } @@ -384,12 +513,12 @@ static int freeze_processes(void) */ for (; i <= nr_attempts; i++) { state = get_freezer_state(fd); - if (!state) { + if (state == FREEZER_ERROR) { close(fd); return -1; } - if (state == frozen) + if (state == FROZEN) break; if (alarm_timeouted()) goto err; @@ -420,13 +549,9 @@ static int freeze_processes(void) } err: - if (exit_code == 0 || freezer_thawed) { - lseek(fd, 0, SEEK_SET); - if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) { - pr_perror("Unable to thaw 
tasks"); - exit_code = -1; - } - } + if (exit_code == 0 || freezer_thawed) + exit_code = freezer_write_state(fd, THAWED); + if (close(fd)) { pr_perror("Unable to thaw tasks"); return -1; @@ -483,7 +608,7 @@ static int collect_children(struct pstree_item *item) if (!opts.freeze_cgroup) /* fails when meets a zombie */ - compel_interrupt_task(pid); + __ignore_value(compel_interrupt_task(pid)); ret = compel_wait_task(pid, item->pid->real, parse_pid_status, NULL, &creds.s, NULL); if (ret < 0) { @@ -784,6 +909,27 @@ static int collect_task(struct pstree_item *item) return -1; } +static int cgroup_version(void) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup); + if (access(path, F_OK) == 0) { + cgroup_v2 = false; + return 0; + } + + snprintf(path, sizeof(path), "%s/cgroup.freeze", opts.freeze_cgroup); + if (access(path, F_OK) == 0) { + cgroup_v2 = true; + return 0; + } + + pr_err("Neither a cgroupv1 (freezer.state) or cgroupv2 (cgroup.freeze) control file found.\n"); + + return -1; +} + int collect_pstree(void) { pid_t pid = root_item->pid->real; @@ -799,6 +945,11 @@ int collect_pstree(void) */ alarm(opts.timeout); + if (opts.freeze_cgroup && cgroup_version()) + goto err; + + pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 
2 : 1); + if (opts.freeze_cgroup && freeze_processes()) goto err; diff --git a/criu/shmem.c b/criu/shmem.c index cee47dba7f..76b537d9ed 100644 --- a/criu/shmem.c +++ b/criu/shmem.c @@ -23,6 +23,7 @@ #include "types.h" #include "page.h" #include "util.h" +#include "memfd.h" #include "protobuf.h" #include "images/pagemap.pb-c.h" @@ -288,7 +289,7 @@ static int open_shmem_sysv(int pid, struct vma_area *vma) VmaEntry *vme = vma->e; struct shmem_info *si; struct shmem_sysv_att *att; - uint64_t ret_fd; + int64_t ret_fd; si = shmem_find(vme->shmid); if (!si) { @@ -490,7 +491,7 @@ static int do_restore_shmem_content(void *addr, unsigned long size, unsigned lon return ret; } -static int restore_shmem_content(void *addr, struct shmem_info *si) +int restore_shmem_content(void *addr, struct shmem_info *si) { return do_restore_shmem_content(addr, si->size, si->shmid); } @@ -500,6 +501,41 @@ int restore_sysv_shmem_content(void *addr, unsigned long size, unsigned long shm return do_restore_shmem_content(addr, round_up(size, PAGE_SIZE), shmid); } +int restore_memfd_shmem_content(int fd, unsigned long shmid, unsigned long size) +{ + void *addr = NULL; + int ret = 1; + + if (size == 0) + return 0; + + if (ftruncate(fd, size) < 0) { + pr_perror("Can't resize shmem 0x%lx size=%ld", shmid, size); + goto out; + } + + addr = mmap(NULL, size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap shmem 0x%lx size=%ld", shmid, size); + goto out; + } + + /* + * do_restore_shmem_content needs size to be page aligned. 
+ */ + if (do_restore_shmem_content(addr, round_up(size, PAGE_SIZE), shmid) < 0) { + pr_err("Can't restore shmem content\n"); + goto out; + } + + ret = 0; + +out: + if (addr) + munmap(addr, size); + return ret; +} + static int open_shmem(int pid, struct vma_area *vma) { VmaEntry *vi = vma->e; @@ -532,7 +568,7 @@ static int open_shmem(int pid, struct vma_area *vma) flags = MAP_SHARED; if (kdat.has_memfd) { - f = syscall(SYS_memfd_create, "", 0); + f = memfd_create("", 0); if (f < 0) { pr_perror("Unable to create memfd"); goto err; @@ -779,6 +815,32 @@ static int dump_one_shmem(struct shmem_info *si) return ret; } +int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size) +{ + int ret = -1; + void *addr; + struct shmem_info si; + + if (size == 0) + return 0; + + memset(&si, 0, sizeof(si)); + si.shmid = shmid; + si.size = size; + + addr = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + pr_perror("Can't mmap shmem 0x%lx", shmid); + goto err; + } + + ret = do_dump_one_shmem(fd, addr, &si); + + munmap(addr, size); +err: + return ret; +} + int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid) { int fd, ret; diff --git a/criu/sk-inet.c b/criu/sk-inet.c index f9c64c7af5..3425485851 100644 --- a/criu/sk-inet.c +++ b/criu/sk-inet.c @@ -551,7 +551,7 @@ static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int fa switch (proto) { case IPPROTO_TCP: - err = (type != SOCK_RAW) ? dump_one_tcp(lfd, sk) : 0; + err = (type != SOCK_RAW) ? 
dump_one_tcp(lfd, sk, &skopts) : 0; break; case IPPROTO_UDP: case IPPROTO_UDPLITE: @@ -747,6 +747,10 @@ static int post_open_inet_sk(struct file_desc *d, int sk) if (!val && restore_opt(sk, SOL_SOCKET, SO_BROADCAST, &val)) return -1; + val = ii->ie->opts->so_keepalive; + if (!val && restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val)) + return -1; + return 0; } diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c index 4fd2eb8e60..7ee6038186 100644 --- a/criu/sk-tcp.c +++ b/criu/sk-tcp.c @@ -218,8 +218,26 @@ static int dump_tcp_conn_state(struct inet_sk_desc *sk) return ret; } -int dump_one_tcp(int fd, struct inet_sk_desc *sk) +int dump_one_tcp(int fd, struct inet_sk_desc *sk, SkOptsEntry *soe) { + soe->has_tcp_keepcnt = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt)) { + pr_perror("Can't read TCP_KEEPCNT"); + return -1; + } + + soe->has_tcp_keepidle = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle)) { + pr_perror("Can't read TCP_KEEPIDLE"); + return -1; + } + + soe->has_tcp_keepintvl = true; + if (dump_opt(fd, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl)) { + pr_perror("Can't read TCP_KEEPINTVL"); + return -1; + } + if (sk->dst_port == 0) return 0; diff --git a/criu/sk-unix.c b/criu/sk-unix.c index f0620e6761..c9f30abd42 100644 --- a/criu/sk-unix.c +++ b/criu/sk-unix.c @@ -54,11 +54,6 @@ * as "external" and require the --ext-unix-sk option. 
*/ -#define USK_EXTERN (1 << 0) -#define USK_SERVICE (1 << 1) -#define USK_CALLBACK (1 << 2) -#define USK_INHERIT (1 << 3) - #define FAKE_INO 0 struct unix_sk_desc { @@ -79,6 +74,9 @@ struct unix_sk_desc { unsigned char shutdown; bool deleted; + bool bindmount; + unsigned int mnt_id; + mode_t mode; uid_t uid; gid_t gid; @@ -100,6 +98,7 @@ static mutex_t *mutex_ghost; static LIST_HEAD(unix_sockets); static LIST_HEAD(unix_ghost_addr); +static LIST_HEAD(unix_mnt_sockets); static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, UnixSkEntry *ue, const struct fd_parms *p); @@ -130,7 +129,7 @@ static struct unix_sk_listen_icon *lookup_unix_listen_icons(unsigned int peer_in static void show_one_unix(char *act, const struct unix_sk_desc *sk) { - pr_debug("\t%s: ino %d peer_ino %d family %4d type %4d state %2d name %s\n", + pr_debug("\t%s: ino %u peer_ino %u family %4d type %4d state %2d name %s\n", act, sk->sd.ino, sk->peer_ino, sk->sd.family, sk->type, sk->state, sk->name); if (sk->nr_icons) { @@ -143,7 +142,7 @@ static void show_one_unix(char *act, const struct unix_sk_desc *sk) static void show_one_unix_img(const char *act, const UnixSkEntry *e) { - pr_info("\t%s: id %#x ino %d peer %d type %d state %d name %d bytes\n", + pr_info("\t%s: id %#x ino %u peer %u type %d state %d name %d bytes\n", act, e->id, e->ino, e->peer, e->type, e->state, (int)e->name.len); } @@ -386,6 +385,9 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) if (unix_resolve_name(lfd, id, sk, ue, p)) goto err; + if (sk->bindmount) + ue->uflags |= UNIX_UFLAGS__BINDMOUNT; + /* * Check if this socket is connected to criu service. * Dump it like closed one and mark it for restore. 
@@ -393,7 +395,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) if (unlikely(ue->peer == service_sk_ino)) { ue->state = TCP_CLOSE; ue->peer = 0; - ue->uflags |= USK_SERVICE; + ue->uflags |= UNIX_UFLAGS__SERVICE; } if (sk->namelen && *sk->name) { @@ -426,7 +428,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) if (ue->peer) { peer = (struct unix_sk_desc *)lookup_socket(ue->peer, PF_UNIX, 0); if (IS_ERR_OR_NULL(peer)) { - pr_err("Unix socket %d without peer %d\n", + pr_err("Unix socket %u without peer %u\n", ue->ino, ue->peer); goto err; } @@ -437,7 +439,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) */ if (peer->peer_ino != ue->ino) { if (!peer->name) { - pr_err("Unix socket %d with unreachable peer %d (%d)\n", + pr_err("Unix socket %u with unreachable peer %u (%u)\n", ue->ino, ue->peer, peer->peer_ino); goto err; } @@ -513,7 +515,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) ue->peer = e->sk_desc->sd.ino; - pr_debug("\t\tFixed inflight socket %d peer %d)\n", + pr_debug("\t\tFixed inflight socket %u peer %u)\n", ue->ino, ue->peer); } dump: @@ -528,7 +530,7 @@ static int dump_one_unix_fd(int lfd, uint32_t id, const struct fd_parms *p) * Postpone writing the entry if a peer isn't found yet. * It's required, because we may need to modify the entry. * For example, if a socket is external and is dumped by - * a callback, the USK_CALLBACK flag must be set. + * a callback, the UNIX_UFLAGS__CALLBACK flag must be set. 
*/ if (list_empty(&sk->peer_node) && write_unix_entry(sk)) return -1; @@ -573,11 +575,16 @@ static int unix_resolve_name(int lfd, uint32_t id, struct unix_sk_desc *d, if (d->namelen == 0 || name[0] == '\0') return 0; - if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { - if (get_mnt_id(lfd, &mnt_id)) - return -1; - ue->mnt_id = mnt_id; - ue->has_mnt_id = mnt_id; + if (!d->bindmount) { + if (kdat.sk_unix_file && (root_ns_mask & CLONE_NEWNS)) { + if (get_mnt_id(lfd, &mnt_id)) + return -1; + ue->mnt_id = mnt_id; + ue->has_mnt_id = mnt_id; + } + } else { + ue->mnt_id = d->mnt_id; + ue->has_mnt_id = true; } if (ue->mnt_id >= 0) @@ -698,6 +705,7 @@ static int unix_collect_one(const struct unix_diag_msg *m, INIT_LIST_HEAD(&d->peer_list); INIT_LIST_HEAD(&d->peer_node); d->fd = -1; + d->mnt_id = -1; if (tb[UNIX_DIAG_SHUTDOWN]) d->shutdown = nla_get_u8(tb[UNIX_DIAG_SHUTDOWN]); @@ -793,7 +801,7 @@ static int __dump_external_socket(struct unix_sk_desc *sk, return -1; if (ret == 0) { - sk->ue->uflags |= USK_CALLBACK; + sk->ue->uflags |= UNIX_UFLAGS__CALLBACK; return 0; } @@ -870,7 +878,7 @@ int fix_external_unix_sockets(void) e.state = TCP_LISTEN; e.name.data = (void *)sk->name; e.name.len = (size_t)sk->namelen; - e.uflags = USK_EXTERN; + e.uflags = UNIX_UFLAGS__EXTERN; e.peer = 0; e.fown = &fown; e.opts = &skopts; @@ -893,13 +901,86 @@ int fix_external_unix_sockets(void) return -1; } +int collect_unix_bindmounts(void) +{ + struct mount_info *mi; + struct stat st = {}; + int ns_old = -1; + int ret = 0; + + pr_debug("Collecting unix bindmounts\n"); + + for (mi = mntinfo; mi; mi = mi->next) { + if (list_empty(&mi->mnt_bind)) + continue; + + if (switch_ns(mi->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0) { + pr_err("Can't switch ns to mnt_id %d", mi->mnt_id); + if (restore_ns(ns_old, &mnt_ns_desc)) { + pr_err("Can't switch mount ns back from mnt_id %d\n", mi->mnt_id); + return -1; + } + return -1; + } + + if (stat(mi->mountpoint, &st)) { + pr_warn("Can't stat on %s: %m\n", 
mi->mountpoint); + if (restore_ns(ns_old, &mnt_ns_desc)) { + pr_err("Can't switch mount ns back from mnt_id %d\n", mi->mnt_id); + return -1; + } + continue; + } + + if (S_ISSOCK(st.st_mode)) { + struct unix_sk_desc *sk; + + list_for_each_entry(sk, &unix_sockets, list) { + if (sk->vfs_ino == (int)st.st_ino && + sk->vfs_dev == (int)st.st_dev) { + pr_debug("Found sock s_dev %#x ino %d bindmounted mnt_id %d %s\n", + (int)st.st_dev, (int)st.st_ino, mi->mnt_id, mi->mountpoint); + if (sk->bindmount) { + pr_err("Many bindings for sockets are not yet supported %d at %s\n", + (int)st.st_ino, mi->mountpoint); + ret = -1; + } else { + sk->mnt_id = mi->mnt_id; + sk->bindmount = true; + } + if (sk->type != SOCK_DGRAM && sk->state != TCP_CLOSE) { + pr_err("Unsupported bindmounted socket ino %d at %s\n", + (int)st.st_ino, mi->mountpoint); + ret = -1; + } + break; + } + } + } + + if (restore_ns(ns_old, &mnt_ns_desc)) { + pr_err("Can't switch mount ns back from %d\n", mi->nsid->ns_pid); + return -1; + } + + if (ret) + break; + } + + return ret; +} + struct unix_sk_info { UnixSkEntry *ue; struct list_head list; + struct list_head mnt_list; char *name; char *name_dir; unsigned flags; - int fdstore_id; + union { + int fdstore_id; + int fdstore_mnt_id[2]; + }; struct unix_sk_info *peer; struct pprep_head peer_resolve; /* XXX : union with the above? 
*/ struct file_desc d; @@ -933,6 +1014,8 @@ struct scm_fle { #define USK_PAIR_MASTER 0x1 #define USK_PAIR_SLAVE 0x2 #define USK_GHOST_FDSTORE 0x4 /* bound but removed address */ +#define USK_BINDMOUNT 0x8 /* socket is pre-openeded for bindmount reason */ +#define USK_NOCWD 0x10 /* no cwd switch */ static struct unix_sk_info *find_unix_sk_by_ino(int ino) { @@ -1102,10 +1185,13 @@ static bool peer_is_not_prepared(struct unix_sk_info *peer) return (!peer->listen); } -static int restore_unix_queue(int fd, struct unix_sk_info *peer) +static int restore_unix_queue(int fd, SkOptsEntry *soe, struct unix_sk_info *peer) { struct pstree_item *task; + if (restore_socket_bufsz(fd, soe)) + return -1; + if (restore_sk_queue(fd, peer->ue->id)) return -1; if (peer->queuer) @@ -1152,6 +1238,9 @@ static int revert_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, int *ro { int ret = 0; + if (ui->flags & USK_NOCWD) + return 0; + if (*ns_fd >= 0 && restore_ns(*ns_fd, &mnt_ns_desc)) ret = -1; if (*root_fd >= 0) { @@ -1179,6 +1268,26 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, static struct ns_id *root = NULL, *ns; int fd; + if (ui->flags & USK_NOCWD) + return 0; + + /* + * To change mount namespace we should have fs->user = 1 + * (see fs/namespace.c:mntns_install) but this is not + * usually possible since main criu process already may + * has forked() with CLONE_FS | CLONE_FILES and fs->user + * is a way bigger. + * + * For now simply switch to old scheme where all sockets + * are restored in root mount namespace. + * + * FIXME: Need to revisit later. 
+ */ + if (prev_mntns_fd && ui->name[0] && ui->ue->mnt_id >= 0) { + *prev_mntns_fd = -1; + prev_mntns_fd = NULL; + } + if (prev_mntns_fd && ui->name[0] && ui->ue->mnt_id >= 0) { struct ns_id *mntns = lookup_nsid_by_mnt_id(ui->ue->mnt_id); int ns_fd; @@ -1192,8 +1301,10 @@ static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd, if (ns_fd < 0) return -1; - if (switch_ns_by_fd(ns_fd, &mnt_ns_desc, prev_mntns_fd)) + if (switch_ns_by_fd(ns_fd, &mnt_ns_desc, prev_mntns_fd)) { + close(ns_fd); return -1; + } set_proc_self_fd(-1); close(ns_fd); @@ -1266,7 +1377,7 @@ static int post_open_standalone(struct file_desc *d, int fd) ui = container_of(d, struct unix_sk_info, d); BUG_ON((ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE)) || - (ui->ue->uflags & (USK_CALLBACK | USK_INHERIT))); + (ui->ue->uflags & (UNIX_UFLAGS__CALLBACK | UNIX_UFLAGS__INHERIT))); if (chk_restored_scms(ui)) return 1; @@ -1334,8 +1445,8 @@ static int post_open_standalone(struct file_desc *d, int fd) restore_queue: if (peer->queuer == ui && - !(peer->ue->uflags & USK_EXTERN) && - restore_unix_queue(fd, peer)) + !(peer->ue->uflags & UNIX_UFLAGS__EXTERN) && + restore_unix_queue(fd, ui->ue->opts, peer)) return -1; restore_sk_common: if (ui->queuer && !ui->queuer->peer_queue_restored) @@ -1383,7 +1494,7 @@ static int keep_deleted(struct unix_sk_info *ui) { int fd = open(ui->name, O_PATH); if (fd < 0) { - pr_perror("ghost: Can't open id %#x ino %d addr %s", + pr_perror("ghost: Can't open id %#x ino %u addr %s", ui->ue->id, ui->ue->ino, ui->name); return -1; } @@ -1409,7 +1520,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) int ret; if (ui->ue->name.len >= UNIX_PATH_MAX) { - pr_err("ghost: Too long name for socket id %#x ino %d name %s\n", + pr_err("ghost: Too long name for socket id %#x ino %u name %s\n", ui->ue->id, ui->ue->ino, ui->name); return -ENOSPC; } @@ -1424,14 +1535,14 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) ret = access(path, R_OK | W_OK | X_OK); 
if (ret == 0) { ui->ghost_dir_pos = pos - path; - pr_debug("ghost: socket id %#x ino %d name %s detected F_OK %s\n", + pr_debug("ghost: socket id %#x ino %u name %s detected F_OK %s\n", ui->ue->id, ui->ue->ino, ui->name, path); break; } if (errno != ENOENT) { ret = -errno; - pr_perror("ghost: Can't access %s for socket id %#x ino %d name %s", + pr_perror("ghost: Can't access %s for socket id %#x ino %u name %s", path, ui->ue->id, ui->ue->ino, ui->name); return ret; } @@ -1441,7 +1552,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) path[ui->ue->name.len] = '\0'; pos = dirname(path); - pr_debug("ghost: socket id %#x ino %d name %s creating %s\n", + pr_debug("ghost: socket id %#x ino %u name %s creating %s\n", ui->ue->id, ui->ue->ino, ui->name, pos); ret = mkdirpat(AT_FDCWD, pos, 0755); if (ret) { @@ -1471,15 +1582,15 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) * clean it up. */ if (unlinkat(AT_FDCWD, path_parked, 0) == 0) - pr_debug("ghost: Unlinked stale socket id %#x ino %d name %s\n", + pr_debug("ghost: Unlinked stale socket id %#x ino %u name %s\n", ui->ue->id, ui->ue->ino, path_parked); if (rename(ui->name, path_parked)) { ret = -errno; - pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s", + pr_perror("ghost: Can't rename id %#x ino %u addr %s -> %s", ui->ue->id, ui->ue->ino, ui->name, path_parked); return ret; } - pr_debug("ghost: id %#x ino %d renamed %s -> %s\n", + pr_debug("ghost: id %#x ino %u renamed %s -> %s\n", ui->ue->id, ui->ue->ino, ui->name, path_parked); renamed = true; ret = bind(sk, (struct sockaddr *)&addr, @@ -1487,7 +1598,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) } if (ret < 0) { ret = -errno; - pr_perror("ghost: Can't bind on socket id %#x ino %d addr %s", + pr_perror("ghost: Can't bind on socket id %#x ino %u addr %s", ui->ue->id, ui->ue->ino, ui->name); return ret; } @@ -1499,7 +1610,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) ret = keep_deleted(ui); if 
(ret < 0) { - pr_err("ghost: Can't save socket %#x ino %d addr %s into fdstore\n", + pr_err("ghost: Can't save socket %#x ino %u addr %s into fdstore\n", ui->ue->id, ui->ue->ino, ui->name); return -EIO; } @@ -1511,7 +1622,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) ret = unlinkat(AT_FDCWD, ui->name, 0); if (ret < 0) { ret = -errno; - pr_perror("ghost: Can't unlink socket %#x ino %d addr %s", + pr_perror("ghost: Can't unlink socket %#x ino %u addr %s", ui->ue->id, ui->ue->ino, ui->name); return ret; } @@ -1519,12 +1630,12 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) if (renamed) { if (rename(path_parked, ui->name)) { ret = -errno; - pr_perror("ghost: Can't rename id %#x ino %d addr %s -> %s", + pr_perror("ghost: Can't rename id %#x ino %u addr %s -> %s", ui->ue->id, ui->ue->ino, path_parked, ui->name); return ret; } - pr_debug("ghost: id %#x ino %d renamed %s -> %s\n", + pr_debug("ghost: id %#x ino %u renamed %s -> %s\n", ui->ue->id, ui->ue->ino, path_parked, ui->name); } @@ -1542,11 +1653,11 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) pos = strrchr(path, '/')) { *pos = '\0'; if (rmdir(path)) { - pr_perror("ghost: Can't remove directory %s on id %#x ino %d", + pr_perror("ghost: Can't remove directory %s on id %#x ino %u", path, ui->ue->id, ui->ue->ino); return -1; } - pr_debug("ghost: Removed %s on id %#x ino %d\n", + pr_debug("ghost: Removed %s on id %#x ino %u\n", path, ui->ue->id, ui->ue->ino); } } @@ -1554,7 +1665,7 @@ static int bind_on_deleted(int sk, struct unix_sk_info *ui) return 0; } -static int bind_unix_sk(int sk, struct unix_sk_info *ui) +static int bind_unix_sk(int sk, struct unix_sk_info *ui, bool notify) { struct sockaddr_un addr; int cwd_fd = -1, root_fd = -1, ns_fd = -1; @@ -1594,13 +1705,13 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui) mutex_lock(mutex_ghost); if (ui->flags & USK_GHOST_FDSTORE) { - pr_debug("ghost: bind id %#x ino %d addr %s\n", + pr_debug("ghost: bind id %#x 
ino %u addr %s\n", ui->ue->id, ui->ue->ino, ui->name); ret = bind_on_deleted(sk, ui); if (ret) errno = -ret; } else { - pr_debug("bind id %#x ino %d addr %s\n", + pr_debug("bind id %#x ino %u addr %s\n", ui->ue->id, ui->ue->ino, ui->name); ret = bind(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family) + ui->ue->name.len); @@ -1608,13 +1719,13 @@ static int bind_unix_sk(int sk, struct unix_sk_info *ui) goto done; } if (ret < 0) { - pr_perror("Can't bind id %#x ino %d addr %s", + pr_perror("Can't bind id %#x ino %u addr %s", ui->ue->id, ui->ue->ino, ui->name); goto done; } - if (ui->ue->state != TCP_LISTEN) { - ui->bound = 1; + if (notify && ui->ue->state != TCP_LISTEN) { + ui->bound = true; wake_connected_sockets(ui); } @@ -1637,10 +1748,10 @@ static int post_open_interconnected_master(struct unix_sk_info *ui) if (chk_restored_scms(ui) || chk_restored_scms(peer)) return 0; - if (restore_unix_queue(fle->fe->fd, peer)) + if (restore_unix_queue(fle->fe->fd, ui->ue->opts, peer)) return -1; - if (restore_unix_queue(fle_peer->fe->fd, ui)) + if (restore_unix_queue(fle_peer->fe->fd, peer->ue->opts, ui)) return -1; if (restore_sk_common(fle->fe->fd, ui)) @@ -1654,7 +1765,7 @@ static int post_open_interconnected_master(struct unix_sk_info *ui) static void pr_info_opening(const char *prefix, struct unix_sk_info *ui, struct fdinfo_list_entry *fle) { - pr_info("Opening %s (stage %d id %#x ino %d peer %d)\n", + pr_info("Opening %s (stage %d id %#x ino %u peer %u)\n", prefix, fle->stage, ui->ue->id, ui->ue->ino, ui->ue->peer); } @@ -1701,10 +1812,10 @@ static int open_unixsk_pair_master(struct unix_sk_info *ui, int *new_fd) } sk[1] = fle_peer->fe->fd; - if (bind_unix_sk(sk[0], ui)) + if (bind_unix_sk(sk[0], ui, true)) return -1; - if (bind_unix_sk(sk[1], peer)) + if (bind_unix_sk(sk[1], peer, true)) return -1; *new_fd = sk[0]; @@ -1754,12 +1865,77 @@ static int setup_second_end(int *sks, struct fdinfo_list_entry *second_end) return 0; } +static int break_connected(struct 
unix_sk_info *ui, int sk) +{ + if (ui->ue->type == SOCK_DGRAM) { + struct sockaddr_un addr = { .sun_family = AF_UNSPEC }; + pr_debug("Break connected id %#x ino %d\n", + ui->ue->id, ui->ue->ino); + /* + * socketpair() assigns sks[1] as a peer of sks[0] + * (and vice versa). But in this case (not zero peer) + * it's impossible for other sockets to connect + * to sks[0] (see unix_dgram_connect()->unix_may_send()). + * The below is hack: we use that connect with AF_UNSPEC + * clears socket's peer. + * Note, that connect hack flushes receive queue, + * so restore_unix_queue() must be after it. + */ + if (connect(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family))) { + pr_perror("Can't clear socket id %#x peer", ui->ue->id); + return -1; + } + } + return 0; +} + +static int make_socket(struct unix_sk_info *ui, int sks[2], bool pair, bool disjoin_master) +{ + if (unlikely(ui->flags & USK_BINDMOUNT)) { + sks[0] = fdstore_get(ui->fdstore_mnt_id[0]); + sks[1] = fdstore_get(ui->fdstore_mnt_id[1]); + pr_debug("bindmount: Fetch socket pair id %#x ino %d\n", + ui->ue->id, ui->ue->ino); + if (sks[0] < 0 || sks[1] < 0) { + pr_err("bindmount: Can't fetch id %#x socketpair from the store\n", + ui->ue->id); + return -1; + } + } else { + int ret; + + sks[0] = sks[1] = -1; + if (!pair) { + pr_debug("Create socket id %#x ino %d\n", + ui->ue->id, ui->ue->ino); + ret = socket(PF_UNIX, ui->ue->type, 0); + sks[0] = ret; + } else { + pr_debug("Create socket pair id %#x ino %d\n", + ui->ue->id, ui->ue->ino); + ret = socketpair(PF_UNIX, ui->ue->type, 0, sks); + } + + if (ret < 0) { + pr_perror("Can't create %s id %#x\n", + pair ? 
"socketpair" : "socket", + ui->ue->id); + return -1; + } + } + + if (disjoin_master) + return break_connected(ui, sks[0]); + + return 0; +} + static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) { struct unix_sk_info *queuer = ui->queuer; struct unix_sk_info *peer = ui->peer; struct fdinfo_list_entry *fle, *fle_peer; - int sk; + int sks[2]; fle = file_master(&ui->d); pr_info_opening("standalone", ui, fle); @@ -1782,7 +1958,7 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) return post_open_standalone(&ui->d, fle->fe->fd); /* Fake socket will be restored by its peer */ - if (!(ui->ue->uflags & USK_EXTERN) && ui->ue->ino == FAKE_INO) + if (!(ui->ue->uflags & UNIX_UFLAGS__EXTERN) && ui->ue->ino == FAKE_INO) return 1; if (set_netns(ui->ue->ns_id)) @@ -1793,22 +1969,15 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) * If so, put response, that dumping and restoring * was successful. */ - if (ui->ue->uflags & USK_SERVICE) { - int sks[2]; - - if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) { - pr_perror("Can't create socketpair"); + if (ui->ue->uflags & UNIX_UFLAGS__SERVICE) { + if (make_socket(ui, sks, true, false)) return -1; - } if (send_criu_dump_resp(sks[1], true, true) == -1) return -1; close(sks[1]); - sk = sks[0]; } else if (ui->ue->state == TCP_ESTABLISHED && queuer && queuer->ue->ino == FAKE_INO) { - int ret, sks[2]; - if (ui->ue->type != SOCK_STREAM) { pr_err("Non-stream socket %d in established state\n", ui->ue->ino); @@ -1821,51 +1990,21 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) return -1; } - ret = socketpair(PF_UNIX, ui->ue->type, 0, sks); - if (ret < 0) { - pr_perror("Can't create socketpair"); + if (make_socket(ui, sks, true, false)) return -1; - } if (setup_second_end(sks, file_master(&queuer->d))) return -1; - - sk = sks[0]; } else if (ui->ue->type == SOCK_DGRAM && queuer && queuer->ue->ino == FAKE_INO) { - struct sockaddr_un addr; - int sks[2]; - - 
if (socketpair(PF_UNIX, ui->ue->type, 0, sks) < 0) { - pr_perror("Can't create socketpair"); + if (make_socket(ui, sks, true, true)) return -1; - } - - sk = sks[0]; - addr.sun_family = AF_UNSPEC; - - /* - * socketpair() assigns sks[1] as a peer of sks[0] - * (and vice versa). But in this case (not zero peer) - * it's impossible for other sockets to connect - * to sks[0] (see unix_dgram_connect()->unix_may_send()). - * The below is hack: we use that connect with AF_UNSPEC - * clears socket's peer. - * Note, that connect hack flushes receive queue, - * so restore_unix_queue() must be after it. - */ - if (connect(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family))) { - pr_perror("Can't clear socket's peer"); - return -1; - } if (setup_second_end(sks, file_master(&queuer->d))) return -1; - - sk = sks[0]; } else { - if (ui->ue->uflags & USK_CALLBACK) { - sk = run_plugins(RESTORE_UNIX_SK, ui->ue->ino); - if (sk >= 0) + if (ui->ue->uflags & UNIX_UFLAGS__CALLBACK) { + sks[0] = run_plugins(RESTORE_UNIX_SK, ui->ue->ino); + if (sks[0] >= 0) goto out; } @@ -1873,31 +2012,35 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) * Connect to external sockets requires * special option to be passed. */ - if (ui->peer && (ui->peer->ue->uflags & USK_EXTERN) && + if (ui->peer && (ui->peer->ue->uflags & UNIX_UFLAGS__EXTERN) && !(opts.ext_unix_sk)) { pr_err("External socket found in image. 
" "Consider using the --" USK_EXT_PARAM - "option to allow restoring it.\n"); + " option to allow restoring it.\n"); return -1; } - sk = socket(PF_UNIX, ui->ue->type, 0); - if (sk < 0) { - pr_perror("Can't make unix socket"); + pr_debug("socketpair instead of plain socket\n"); + if (make_socket(ui, sks, false, true)) return -1; - } + close(sks[1]); } - if (bind_unix_sk(sk, ui)) { - close(sk); - return -1; + if (!(ui->ue->uflags & UNIX_UFLAGS__BINDMOUNT)) { + if (bind_unix_sk(sks[0], ui, true)) { + close(sks[0]); + return -1; + } + } else { + ui->bound = true; + wake_connected_sockets(ui); } if (ui->ue->state == TCP_LISTEN) { pr_info("\tPutting %d into listen state\n", ui->ue->ino); - if (listen(sk, ui->ue->backlog) < 0) { + if (listen(sks[0], ui->ue->backlog) < 0) { pr_perror("Can't make usk listen"); - close(sk); + close(sks[0]); return -1; } ui->listen = 1; @@ -1912,15 +2055,15 @@ static int open_unixsk_standalone(struct unix_sk_info *ui, int *new_fd) * 2)Queuer won't be able to connect, if we do * shutdown, so postpone it. */ - *new_fd = sk; + *new_fd = sks[0]; return 1; } out: - if (restore_sk_common(sk, ui)) + if (restore_sk_common(sks[0], ui)) return -1; - *new_fd = sk; + *new_fd = sks[0]; return 0; } @@ -1932,7 +2075,7 @@ static int open_unix_sk(struct file_desc *d, int *new_fd) ui = container_of(d, struct unix_sk_info, d); if (inherited_fd(d, new_fd)) { - ui->ue->uflags |= USK_INHERIT; + ui->ue->uflags |= UNIX_UFLAGS__INHERIT; ret = *new_fd >= 0 ? 
0 : -1; } else if (ui->flags & USK_PAIR_MASTER) ret = open_unixsk_pair_master(ui, new_fd); @@ -1950,7 +2093,7 @@ static char *socket_d_name(struct file_desc *d, char *buf, size_t s) ui = container_of(d, struct unix_sk_info, d); - if (snprintf(buf, s, "socket:[%d]", ui->ue->ino) >= s) { + if (snprintf(buf, s, "socket:[%u]", ui->ue->ino) >= s) { pr_err("Not enough room for unixsk %d identifier string\n", ui->ue->ino); return NULL; @@ -1973,7 +2116,9 @@ static int unlink_sk(struct unix_sk_info *ui) { int ret = 0, cwd_fd = -1, root_fd = -1, ns_fd = -1; - if (!ui->name || ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN)) + if (!ui->name || ui->name[0] == '\0' || + (ui->flags & USK_BINDMOUNT) || + (ui->ue->uflags & UNIX_UFLAGS__EXTERN)) return 0; if (prep_unix_sk_cwd(ui, &cwd_fd, &root_fd, NULL)) @@ -1981,14 +2126,14 @@ static int unlink_sk(struct unix_sk_info *ui) ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0; if (ret < 0 && errno != ENOENT) { - pr_warn("Can't unlink socket %d peer %d (name %s dir %s)\n", + pr_warn("Can't unlink socket %u peer %u (name %s dir %s)\n", ui->ue->ino, ui->ue->peer, ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-", ui->name_dir ? ui->name_dir : "-"); ret = -errno; goto out; } else if (ret == 0) { - pr_debug("Unlinked socket %d peer %d (name %s dir %s)\n", + pr_debug("Unlinked socket %u peer %u (name %s dir %s)\n", ui->ue->ino, ui->ue->peer, ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-", ui->name_dir ? 
ui->name_dir : "-"); @@ -2027,7 +2172,8 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) ui->name_dir = (void *)ue->name_dir; ui->flags = 0; - ui->fdstore_id = -1; + ui->fdstore_mnt_id[0] = -1; /* fdstore_id in union */ + ui->fdstore_mnt_id[1] = -1; ui->ghost_dir_pos = 0; ui->peer = NULL; ui->queuer = NULL; @@ -2040,6 +2186,7 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) memzero(&ui->d, sizeof(ui->d)); INIT_LIST_HEAD(&ui->list); + INIT_LIST_HEAD(&ui->mnt_list); INIT_LIST_HEAD(&ui->connected); INIT_LIST_HEAD(&ui->node); INIT_LIST_HEAD(&ui->scm_fles); @@ -2048,16 +2195,20 @@ static int init_unix_sk_info(struct unix_sk_info *ui, UnixSkEntry *ue) return 0; } -int unix_prepare_root_shared(void) +int unix_prepare_shared(void) { - struct unix_sk_info *ui; - mutex_ghost = shmalloc(sizeof(*mutex_ghost)); if (!mutex_ghost) { pr_err("ghost: Can't allocate mutex\n"); return -ENOMEM; } mutex_init(mutex_ghost); + return 0; +} + +int unix_prepare_root_shared(void) +{ + struct unix_sk_info *ui; pr_debug("ghost: Resolving addresses\n"); @@ -2065,7 +2216,7 @@ int unix_prepare_root_shared(void) char tp_name[32]; char st_name[32]; - pr_debug("ghost: id %#x type %s state %s ino %d peer %d address %s\n", + pr_debug("ghost: id %#x type %s state %s ino %u peer %u address %s\n", ui->ue->id, __socket_type_name(ui->ue->type, tp_name), __tcp_state_name(ui->ue->state, st_name), ui->ue->ino, ui->peer ? ui->peer->ue->ino : 0, @@ -2113,7 +2264,7 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i) uname = "-"; } - pr_info(" `- Got id %#x ino %d type %s state %s peer %d (name %s%.*s dir %s)\n", + pr_info(" `- Got id %#x ino %u type %s state %s peer %u (name %s%.*s dir %s)\n", ui->ue->id, ui->ue->ino, ___socket_type_name(ui->ue->type), ___tcp_state_name(ui->ue->state), ui->ue->peer, prefix, ulen, uname, ui->name_dir ? 
ui->name_dir : "-"); @@ -2128,13 +2279,32 @@ static int collect_one_unixsk(void *o, ProtobufCMessage *base, struct cr_img *i) if (ui->ue->deleted) { if (!ui->name || !ui->ue->name.len || !ui->name[0]) { - pr_err("No name present, ino %d\n", ui->ue->ino); + pr_err("No name present, ino %u\n", ui->ue->ino); return -1; } list_add_tail(&ui->ghost_node, &unix_ghost_addr); } + if (ui->ue->uflags & UNIX_UFLAGS__BINDMOUNT) { + /* + * Make sure it is supported socket! + */ + if ((ui->ue->uflags & ~UNIX_UFLAGS__BINDMOUNT) || + (ui->ue->type != SOCK_DGRAM) || + (ui->ue->state != TCP_CLOSE)) { + pr_err("bindmount: Unsupported socket id %#x " + "(expect %x:%s:%s got %x:%s:%s)\n", + ui->ue->id, UNIX_UFLAGS__BINDMOUNT, + ___socket_type_name(SOCK_DGRAM), + ___tcp_state_name(TCP_CLOSE), + ui->ue->uflags, ___socket_type_name(ui->ue->type), + ___tcp_state_name(ui->ue->state)); + return -1; + } + list_add_tail(&ui->mnt_list, &unix_mnt_sockets); + } + list_add_tail(&ui->list, &unix_sockets); return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops); } @@ -2147,6 +2317,126 @@ struct collect_image_info unix_sk_cinfo = { .flags = COLLECT_SHARED, }; +int unix_prepare_bindmount(struct mount_info *mi) +{ + int prev_cwd_fd = -1, prev_root_fd = -1; + int ret = -1, sks[2] = { -1, -1 }; + struct unix_sk_info *ui; + char path[PATH_MAX]; + + list_for_each_entry(ui, &unix_mnt_sockets, mnt_list) { + if (ui->ue->mnt_id == mi->mnt_id) { + char type_name[64], state_name[64]; + pr_info("bindmount: id %#x ino %d type %s state %s (queuer id %#x ino %d) peer %d (name %.*s dir %s)\n", + ui->ue->id, ui->ue->ino, + __socket_type_name(ui->ue->type, type_name), + __tcp_state_name(ui->ue->state, state_name), + ui->queuer ? ui->queuer->ue->id : -1, + ui->queuer ? ui->queuer->ue->ino : -1, + ui->ue->peer, (int)ui->ue->name.len, + ui->ue->name.data, ui->name_dir ? 
ui->name_dir : "-"); + break; + } + } + + if (&ui->mnt_list == &unix_mnt_sockets) + return 0; + + /* + * Mark it as bindmount so when need to use we + * would fetch it from the fdstore, and point + * out that no need to cwd change since we + * already opened it in proper place. + */ + ui->flags |= USK_BINDMOUNT | USK_NOCWD; + + if (rst_get_mnt_root(mi->mnt_id, path, sizeof(path)) < 0) { + pr_err("bindmount: Can't setup mnt_root for %s\n", mi->ns_mountpoint); + return -1; + } + + prev_cwd_fd = open(".", O_RDONLY); + if (prev_cwd_fd < 0) { + pr_perror("bindmount: Can't save current cwd"); + goto out; + } + + prev_root_fd = open("/", O_RDONLY); + if (prev_root_fd < 0) { + pr_perror("bindmount: Can't save current root"); + goto out; + } + + if (chdir(path)) { + pr_perror("bindmount: Can't chdir to %s", path); + goto out; + } else if (chroot(".")) { + pr_perror("bindmount: Can't chroot"); + goto out; + } + + if (ui->name_dir && chdir(ui->name_dir)) { + pr_perror("bindmount: Can't chdir to %s", ui->name_dir); + goto out; + } + + if (set_netns(ui->ue->ns_id)) + return -1; + + /* + * We support only DGRAM sockets for now so it is safe + * to preallocate socket pair here and later the + * open_unixsk_standalone helper will simply fetch the + * peers, closing the ends it doesn't need. 
+ */ + if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) { + pr_perror("bindmount: Can't create socketpair id %#x", + ui->ue->id); + goto out; + } + + if (bind_unix_sk(sks[0], ui, false)) + goto out; + + ui->fdstore_mnt_id[0] = fdstore_add(sks[0]); + ui->fdstore_mnt_id[1] = fdstore_add(sks[1]); + if (ui->fdstore_mnt_id[0] < 0 || ui->fdstore_mnt_id[1] < 0) { + pr_err("bindmount: Can't add socketpair id %#x into fdstore\n", + ui->ue->id); + goto out; + } + + if (fchdir(prev_root_fd)) { + pr_perror("bindmount: Can't revert root directory"); + goto out; + } else if (chroot(".")) { + pr_perror("bindmount: Can't revert chroot "); + goto out; + } else if (fchdir(prev_cwd_fd)) { + pr_perror("bindmount: Can't revert working dir"); + goto out; + } + + /* + * Once we are pre-created and bounded, clear + * the USK_NOCWD flag so other sockets migh connect + * to us via relative name. + */ + ui->flags &= ~USK_NOCWD; + ret = 0; +out: + close_safe(&prev_cwd_fd); + close_safe(&prev_root_fd); + close_safe(&sks[0]); + close_safe(&sks[1]); + + if (ret == 0) + pr_debug("bindmount: Standalone socket moved into fdstore (id %#x ino %d peer %d)\n", + ui->ue->id, ui->ue->ino, ui->ue->peer); + + return ret; +} + static void set_peer(struct unix_sk_info *ui, struct unix_sk_info *peer) { ui->peer = peer; @@ -2206,7 +2496,7 @@ int add_fake_unix_queuers(void) struct unix_sk_info *ui; list_for_each_entry(ui, &unix_sockets, list) { - if ((ui->ue->uflags & (USK_EXTERN | USK_CALLBACK)) || ui->queuer) + if ((ui->ue->uflags & (UNIX_UFLAGS__EXTERN | UNIX_UFLAGS__CALLBACK)) || ui->queuer) continue; if (!(ui->ue->state == TCP_ESTABLISHED && !ui->peer) && ui->ue->type != SOCK_DGRAM) diff --git a/criu/sockets.c b/criu/sockets.c index 312b55c6dc..9f9ea49f3c 100644 --- a/criu/sockets.c +++ b/criu/sockets.c @@ -522,17 +522,23 @@ int restore_prepare_socket(int sk) return 0; } -int restore_socket_opts(int sk, SkOptsEntry *soe) +int restore_socket_bufsz(int sk, SkOptsEntry *soe) { - int ret = 0, val; - struct 
timeval tv; /* In kernel a bufsize value is doubled. */ - u32 bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2}; + uint32_t bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2}; pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf); /* setsockopt() multiplies the input values by 2 */ - ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); + return userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk); +} + +int restore_socket_opts(int sk, SkOptsEntry *soe) +{ + int ret = 0, val; + struct timeval tv; + + ret |= restore_socket_bufsz(sk, soe); if (soe->has_so_priority) { pr_debug("\trestore priority %d for socket\n", soe->so_priority); @@ -547,30 +553,41 @@ int restore_socket_opts(int sk, SkOptsEntry *soe) ret |= restore_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark); } if (soe->has_so_passcred && soe->so_passcred) { - val = 1; pr_debug("\tset passcred for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_PASSCRED, &val); } if (soe->has_so_passsec && soe->so_passsec) { - val = 1; pr_debug("\tset passsec for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_PASSSEC, &val); } if (soe->has_so_dontroute && soe->so_dontroute) { - val = 1; pr_debug("\tset dontroute for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val); } if (soe->has_so_no_check && soe->so_no_check) { - val = 1; pr_debug("\tset no_check for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val); } if (soe->has_so_broadcast && soe->so_broadcast) { - val = 1; pr_debug("\tset broadcast for socket\n"); ret |= restore_opt(sk, SOL_SOCKET, SO_BROADCAST, &val); } + if (soe->has_so_keepalive && soe->so_keepalive) { + pr_debug("\tset keepalive for socket\n"); + ret |= restore_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); + } + if (soe->has_tcp_keepcnt) { + pr_debug("\tset keepcnt for socket\n"); + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPCNT, &soe->tcp_keepcnt); + } + if (soe->has_tcp_keepidle) { + pr_debug("\tset keepidle for socket\n"); + ret |= 
restore_opt(sk, SOL_TCP, TCP_KEEPIDLE, &soe->tcp_keepidle); + } + if (soe->has_tcp_keepintvl) { + pr_debug("\tset keepintvl for socket\n"); + ret |= restore_opt(sk, SOL_TCP, TCP_KEEPINTVL, &soe->tcp_keepintvl); + } tv.tv_sec = soe->so_snd_tmo_sec; tv.tv_usec = soe->so_snd_tmo_usec; @@ -656,6 +673,10 @@ int dump_socket_opts(int sk, SkOptsEntry *soe) soe->has_so_broadcast = true; soe->so_broadcast = val ? true : false; + ret |= dump_opt(sk, SOL_SOCKET, SO_KEEPALIVE, &val); + soe->has_so_keepalive = true; + soe->so_keepalive = val ? true : false; + ret |= dump_bound_dev(sk, soe); ret |= dump_socket_filter(sk, soe); diff --git a/criu/stats.c b/criu/stats.c index 7410b5ced3..891c378000 100644 --- a/criu/stats.c +++ b/criu/stats.c @@ -41,6 +41,18 @@ void cnt_add(int c, unsigned long val) BUG(); } +void cnt_sub(int c, unsigned long val) +{ + if (dstats != NULL) { + BUG_ON(c >= DUMP_CNT_NR_STATS); + dstats->counts[c] -= val; + } else if (rstats != NULL) { + BUG_ON(c >= RESTORE_CNT_NR_STATS); + atomic_add(-val, &rstats->counts[c]); + } else + BUG(); +} + static void timeval_accumulate(const struct timeval *from, const struct timeval *to, struct timeval *res) { diff --git a/criu/tls.c b/criu/tls.c index db9cc4f5a7..f7b94dee8c 100644 --- a/criu/tls.c +++ b/criu/tls.c @@ -31,7 +31,7 @@ static gnutls_certificate_credentials_t x509_cred; static int tls_sk = -1; static int tls_sk_flags = 0; -void tls_terminate_session() +void tls_terminate_session(void) { int ret; @@ -227,7 +227,7 @@ static int tls_x509_verify_peer_cert(void) return 0; } -static int tls_handshake() +static int tls_handshake(void) { int ret = -1; while (ret != GNUTLS_E_SUCCESS) { @@ -241,7 +241,7 @@ static int tls_handshake() return 0; } -static int tls_x509_setup_creds() +static int tls_x509_setup_creds(void) { int ret; char *cacert = CRIU_CACERT; diff --git a/criu/uffd.c b/criu/uffd.c index c47b35b1f8..99373c04de 100644 --- a/criu/uffd.c +++ b/criu/uffd.c @@ -40,6 +40,7 @@ #include "tls.h" #include "fdstore.h" 
#include "util.h" +#include "namespaces.h" #undef LOG_PREFIX #define LOG_PREFIX "uffd: " @@ -254,6 +255,13 @@ bool uffd_noncooperative(void) return (kdat.uffd_features & features) == features; } +static int uffd_api_ioctl(void *arg, int fd, pid_t pid) +{ + struct uffdio_api *uffdio_api = arg; + + return ioctl(fd, UFFDIO_API, uffdio_api); +} + int uffd_open(int flags, unsigned long *features) { struct uffdio_api uffdio_api = { 0 }; @@ -269,7 +277,8 @@ int uffd_open(int flags, unsigned long *features) if (features) uffdio_api.features = *features; - if (ioctl(uffd, UFFDIO_API, &uffdio_api)) { + if (userns_call(uffd_api_ioctl, 0, &uffdio_api, sizeof(uffdio_api), + uffd)) { pr_perror("Failed to get uffd API"); goto err; } diff --git a/criu/util.c b/criu/util.c index 028f604bb8..f85c93a0cc 100644 --- a/criu/util.c +++ b/criu/util.c @@ -26,7 +26,8 @@ #include #include #include -#include + +#include "linux/mount.h" #include "kerndat.h" #include "page.h" @@ -324,7 +325,7 @@ int close_pid_proc(void) return 0; } -void close_proc() +void close_proc(void) { close_pid_proc(); close_service_fd(PROC_FD_OFF); @@ -421,29 +422,52 @@ int copy_file(int fd_in, int fd_out, size_t bytes) { ssize_t written = 0; size_t chunk = bytes ? 
bytes : 4096; + char *buffer; + ssize_t ret; + + buffer = xmalloc(chunk); + if (buffer == NULL) { + pr_perror("failed to allocate buffer to copy file"); + return -1; + } while (1) { - ssize_t ret; + if (opts.remote) { + ret = read(fd_in, buffer, chunk); + if (ret < 0) { + pr_perror("Can't read from fd_in\n"); + ret = -1; + goto err; + } + if (write(fd_out, buffer, ret) != ret) { + pr_perror("Couldn't write all read bytes\n"); + ret = -1; + goto err; + } + } else + ret = sendfile(fd_out, fd_in, NULL, chunk); - ret = sendfile(fd_out, fd_in, NULL, chunk); if (ret < 0) { pr_perror("Can't send data to ghost file"); - return -1; + ret = -1; + goto err; } if (ret == 0) { if (bytes && (written != bytes)) { pr_err("Ghost file size mismatch %zu/%zu\n", written, bytes); - return -1; + ret = -1; + goto err; } break; } written += ret; } - - return 0; +err: + xfree(buffer); + return ret; } int read_fd_link(int lfd, char *buf, size_t size) @@ -536,7 +560,7 @@ int cr_system_userns(int in, int out, int err, char *cmd, sigemptyset(&blockmask); sigaddset(&blockmask, SIGCHLD); if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) { - pr_perror("Can not set mask of blocked signals"); + pr_perror("Cannot set mask of blocked signals"); return -1; } @@ -545,6 +569,12 @@ int cr_system_userns(int in, int out, int err, char *cmd, pr_perror("fork() failed"); goto out; } else if (pid == 0) { + sigemptyset(&blockmask); + if (sigprocmask(SIG_SETMASK, &blockmask, NULL) == -1) { + pr_perror("Cannot clear blocked signals"); + goto out_chld; + } + if (userns_pid > 0) { if (switch_ns(userns_pid, &user_ns_desc, NULL)) goto out_chld; @@ -682,7 +712,7 @@ int cr_daemon(int nochdir, int noclose, int close_fd) return 0; } -int is_root_user() +int is_root_user(void) { if (geteuid() != 0) { pr_err("You need to be root to run this command\n"); @@ -978,80 +1008,7 @@ void tcp_nodelay(int sk, bool on) pr_perror("Unable to restore TCP_NODELAY (%d)", val); } -static inline void pr_xsym(unsigned char *data, 
size_t len, int pos) -{ - char sym; - - if (pos < len) - sym = data[pos]; - else - sym = ' '; - - pr_msg("%c", isprint(sym) ? sym : '.'); -} - -static inline void pr_xdigi(unsigned char *data, size_t len, int pos) -{ - if (pos < len) - pr_msg("%02x ", data[pos]); - else - pr_msg(" "); -} - -static int nice_width_for(unsigned long addr) -{ - int ret = 3; - - while (addr) { - addr >>= 4; - ret++; - } - - return ret; -} - -void print_data(unsigned long addr, unsigned char *data, size_t size) -{ - int i, j, addr_len; - unsigned zero_line = 0; - - addr_len = nice_width_for(addr + size); - - for (i = 0; i < size; i += 16) { - if (*(u64 *)(data + i) == 0 && *(u64 *)(data + i + 8) == 0) { - if (zero_line == 0) - zero_line = 1; - else { - if (zero_line == 1) { - pr_msg("*\n"); - zero_line = 2; - } - - continue; - } - } else - zero_line = 0; - - pr_msg("%#0*lx: ", addr_len, addr + i); - for (j = 0; j < 8; j++) - pr_xdigi(data, size, i + j); - pr_msg(" "); - for (j = 8; j < 16; j++) - pr_xdigi(data, size, i + j); - - pr_msg(" |"); - for (j = 0; j < 8; j++) - pr_xsym(data, size, i + j); - pr_msg(" "); - for (j = 8; j < 16; j++) - pr_xsym(data, size, i + j); - - pr_msg("|\n"); - } -} - -static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, - unsigned short port) +static int get_sockaddr_in(struct sockaddr_storage *addr, char *host) { memset(addr, 0, sizeof(*addr)); @@ -1069,26 +1026,26 @@ static int get_sockaddr_in(struct sockaddr_storage *addr, char *host, } if (addr->ss_family == AF_INET6) { - ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); + ((struct sockaddr_in6 *)addr)->sin6_port = htons(opts.port); } else if (addr->ss_family == AF_INET) { - ((struct sockaddr_in *)addr)->sin_port = htons(port); + ((struct sockaddr_in *)addr)->sin_port = htons(opts.port); } return 0; } -int setup_tcp_server(char *type, char *addr, unsigned short *port) +int setup_tcp_server(char *type) { int sk = -1; int sockopt = 1; struct sockaddr_storage saddr; socklen_t slen = 
sizeof(saddr); - if (get_sockaddr_in(&saddr, addr, (*port))) { + if (get_sockaddr_in(&saddr, opts.addr)) { return -1; } - pr_info("Starting %s server on port %u\n", type, *port); + pr_info("Starting %s server on port %u\n", type, opts.port); sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_TCP); @@ -1114,19 +1071,19 @@ int setup_tcp_server(char *type, char *addr, unsigned short *port) } /* Get socket port in case of autobind */ - if ((*port) == 0) { + if (opts.port == 0) { if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) { pr_perror("Can't get %s server name", type); goto out; } if (saddr.ss_family == AF_INET6) { - (*port) = ntohs(((struct sockaddr_in *)&saddr)->sin_port); + opts.port = ntohs(((struct sockaddr_in6 *)&saddr)->sin6_port); } else if (saddr.ss_family == AF_INET) { - (*port) = ntohs(((struct sockaddr_in6 *)&saddr)->sin6_port); + opts.port = ntohs(((struct sockaddr_in *)&saddr)->sin_port); } - pr_info("Using %u port\n", (*port)); + pr_info("Using %u port\n", opts.port); } return sk; @@ -1183,7 +1140,7 @@ int run_tcp_server(bool daemon_mode, int *ask, int cfd, int sk) return -1; } -int setup_tcp_client(char *hostname) +int setup_tcp_client(void) { struct sockaddr_storage saddr; struct addrinfo addr_criteria, *addr_list, *p; @@ -1198,10 +1155,10 @@ int setup_tcp_client(char *hostname) /* * addr_list contains a list of addrinfo structures that corresponding - * to the criteria specified in hostname and addr_criteria. + * to the criteria specified in opts.addr and addr_criteria. 
*/ - if (getaddrinfo(hostname, NULL, &addr_criteria, &addr_list)) { - pr_perror("Failed to resolve hostname: %s", hostname); + if (getaddrinfo(opts.addr, NULL, &addr_criteria, &addr_list)) { + pr_perror("Failed to resolve hostname: %s", opts.addr); goto out; } @@ -1222,7 +1179,7 @@ int setup_tcp_client(char *hostname) inet_ntop(p->ai_family, ip, ipstr, sizeof(ipstr)); pr_info("Connecting to server %s:%u\n", ipstr, opts.port); - if (get_sockaddr_in(&saddr, ipstr, opts.port)) + if (get_sockaddr_in(&saddr, ipstr)) goto out; sk = socket(saddr.ss_family, SOCK_STREAM, IPPROTO_TCP); @@ -1417,3 +1374,27 @@ void print_stack_trace(pid_t pid) free(strings); } #endif + +int mount_detached_fs(const char *fsname) +{ + int fsfd, fd; + + fsfd = sys_fsopen(fsname, 0); + if (fsfd < 0) { + pr_perror("Unable to open the %s file system", fsname); + return -1; + } + + if (sys_fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) { + pr_perror("Unable to create the %s file system", fsname); + close(fsfd); + return -1; + } + + fd = sys_fsmount(fsfd, 0, 0); + if (fd < 0) + pr_perror("Unable to mount the %s file system", fsname); + close(fsfd); + return fd; +} + diff --git a/criu/vdso.c b/criu/vdso.c index 50b8b8dba5..433a547286 100644 --- a/criu/vdso.c +++ b/criu/vdso.c @@ -20,7 +20,6 @@ #include "criu-log.h" #include "mem.h" #include "vma.h" -#include #include #ifdef LOG_PREFIX @@ -275,6 +274,10 @@ int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid, struct vma_area *vma; int fd = -1; + /* vDSO is not provided by kernel */ + if (kdat.vdso_sym.vdso_size == VDSO_BAD_SIZE) + return 0; + vcheck = get_vdso_check_type(ctl); if (vcheck == VDSO_CHECK_PFN) { BUG_ON(vdso_pfn == VDSO_BAD_PFN); @@ -534,21 +537,6 @@ static int vdso_fill_compat_symtable(struct vdso_maps *native, } #endif /* CONFIG_COMPAT */ -int vdso_init_dump(void) -{ - if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { - pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); - return -1; - } - - if (kdat.pmap != 
PM_FULL) - pr_info("VDSO detection turned off\n"); - else if (vaddr_to_pfn(-1, vdso_maps.vdso_start, &vdso_pfn)) - return -1; - - return 0; -} - /* * Check vdso/vvar sized read from maps to kdat values. * We do not read /proc/self/maps for compatible vdso as it's @@ -566,11 +554,36 @@ static int is_kdat_vdso_sym_valid(void) return true; } -int vdso_init_restore(void) +int vdso_init_dump(void) { + if (vdso_parse_maps(PROC_SELF, &vdso_maps)) { + pr_err("Failed reading self/maps for filling vdso/vvar bounds\n"); + return -1; + } + + if (!is_kdat_vdso_sym_valid()) { + pr_err("Kdat sizes of vdso/vvar differ to maps file \n"); + return -1; + } + if (kdat.vdso_sym.vdso_size == VDSO_BAD_SIZE) { - pr_err("Kdat has empty vdso symtable\n"); + pr_debug("Kdat has empty vdso symtable - probably CONFIG_VDSO is not set\n"); + return 0; + } + + if (kdat.pmap != PM_FULL) + pr_info("VDSO detection turned off\n"); + else if (vaddr_to_pfn(-1, vdso_maps.vdso_start, &vdso_pfn)) return -1; + + return 0; +} + +int vdso_init_restore(void) +{ + if (kdat.vdso_sym.vdso_size == VDSO_BAD_SIZE) { + pr_debug("Kdat has empty vdso symtable - probably CONFIG_VDSO is not set\n"); + return 0; } /* Already filled vdso_maps during kdat test */ @@ -611,6 +624,12 @@ int kerndat_vdso_fill_symtable(void) return -1; } + if (!vdso_is_present(&vdso_maps)) { + pr_debug("Kernel doesn't premap vDSO - probably CONFIG_VDSO is not set\n"); + kdat.vdso_sym = vdso_maps.sym; + return 0; + } + if (vdso_fill_self_symtable(&vdso_maps)) { pr_err("Failed to fill self vdso symtable\n"); return -1; @@ -643,7 +662,7 @@ int kerndat_vdso_preserves_hint(void) kdat.vdso_hint_reliable = 0; - if (vdso_maps.vdso_start == VDSO_BAD_ADDR) + if (!vdso_is_present(&vdso_maps)) return 0; child = fork(); @@ -693,7 +712,7 @@ int kerndat_vdso_preserves_hint(void) goto out_kill; } - if (vdso_maps_after.vdso_start != VDSO_BAD_ADDR) + if (vdso_is_present(&vdso_maps_after)) kdat.vdso_hint_reliable = 1; ret = 0; diff --git a/images/Makefile 
b/images/Makefile index edaab06338..fba86b3058 100644 --- a/images/Makefile +++ b/images/Makefile @@ -63,6 +63,8 @@ proto-obj-y += sysctl.o proto-obj-y += autofs.o proto-obj-y += macvlan.o proto-obj-y += sit.o +proto-obj-y += remote-image.o +proto-obj-y += memfd.o CFLAGS += -iquote $(obj)/ diff --git a/images/core.proto b/images/core.proto index c3dba6f6d7..e90522914e 100644 --- a/images/core.proto +++ b/images/core.proto @@ -53,6 +53,8 @@ message task_core_entry { //optional int32 tty_pgrp = 17; optional bool child_subreaper = 18; + // Reserved for container relative start time + //optional uint64 start_time = 19; } message task_kobj_ids_entry { diff --git a/images/fdinfo.proto b/images/fdinfo.proto index 77e375aa94..d966d5bc5b 100644 --- a/images/fdinfo.proto +++ b/images/fdinfo.proto @@ -16,6 +16,7 @@ import "sk-unix.proto"; import "fifo.proto"; import "pipe.proto"; import "tty.proto"; +import "memfd.proto"; enum fd_types { UND = 0; @@ -36,6 +37,7 @@ enum fd_types { TUNF = 15; EXT = 16; TIMERFD = 17; + MEMFD = 18; /* Any number above the real used. 
Not stored to image */ CTL_TTY = 65534; @@ -70,4 +72,5 @@ message file_entry { optional fifo_entry fifo = 17; optional pipe_entry pipe = 18; optional tty_file_entry tty = 19; + optional memfd_file_entry memfd = 20; } diff --git a/images/ghost-file.proto b/images/ghost-file.proto index eda4664517..0576089fdd 100644 --- a/images/ghost-file.proto +++ b/images/ghost-file.proto @@ -15,6 +15,8 @@ message ghost_file_entry { optional timeval mtim = 8; optional bool chunks = 9; optional uint64 size = 10; + /* this field makes sense only when S_ISLNK(mode) */ + optional string symlnk_target = 11; } message ghost_chunk_entry { diff --git a/images/inventory.proto b/images/inventory.proto index 7bc2b0c022..d1438e8c8c 100644 --- a/images/inventory.proto +++ b/images/inventory.proto @@ -16,4 +16,5 @@ message inventory_entry { optional uint32 root_cg_set = 5; optional lsmtype lsmtype = 6; optional uint64 dump_uptime = 8; + optional uint32 pre_dump_mode = 9; } diff --git a/images/memfd.proto b/images/memfd.proto new file mode 100644 index 0000000000..546ffc2ab8 --- /dev/null +++ b/images/memfd.proto @@ -0,0 +1,21 @@ +syntax = "proto2"; + +import "opts.proto"; +import "fown.proto"; + +message memfd_file_entry { + required uint32 id = 1; + required uint32 flags = 2 [(criu).flags = "rfile.flags"]; + required uint64 pos = 3; + required fown_entry fown = 4; + required uint32 inode_id = 5; +}; + +message memfd_inode_entry { + required string name = 1; + required uint32 uid = 2; + required uint32 gid = 3; + required uint64 size = 4; + required uint32 shmid = 5; + required uint32 seals = 6 [(criu).flags = "seals.flags"]; +}; diff --git a/images/mnt.proto b/images/mnt.proto index 4160acbf62..8983395aea 100644 --- a/images/mnt.proto +++ b/images/mnt.proto @@ -28,6 +28,8 @@ enum fstype { // RPC_PIPEFS = 20; // NFS = 21; // NFS4 = 22; + + CGROUP2 = 23; }; message mnt_entry { diff --git a/images/netdev.proto b/images/netdev.proto index 476a92cedb..ae9c995316 100644 --- a/images/netdev.proto +++ 
b/images/netdev.proto @@ -71,4 +71,5 @@ message netns_entry { repeated netns_id nsids = 7; optional string ext_key = 8; + repeated sysctl_entry unix_conf = 9; } diff --git a/images/remote-image.proto b/images/remote-image.proto new file mode 100644 index 0000000000..f6b81503a0 --- /dev/null +++ b/images/remote-image.proto @@ -0,0 +1,22 @@ +syntax = "proto2"; + +message local_image_entry { + required string name = 1; + required string snapshot_id = 2; + required uint32 open_mode = 3; +} + +message remote_image_entry { + required string name = 1; + required string snapshot_id = 2; + required uint32 open_mode = 3; + required uint64 size = 4; +} + +message local_image_reply_entry { + required uint32 error = 1; +} + +message snapshot_id_entry { + required string snapshot_id = 1; +} diff --git a/images/rpc.proto b/images/rpc.proto index 15e677a775..df1b5aed2a 100644 --- a/images/rpc.proto +++ b/images/rpc.proto @@ -47,6 +47,11 @@ enum criu_cg_mode { DEFAULT = 6; }; +enum criu_pre_dump_mode { + SPLICE = 1; + VM_READ = 2; +}; + message criu_opts { required int32 images_dir_fd = 1; optional int32 pid = 2; /* if not set on dump, will dump requesting process */ @@ -120,6 +125,8 @@ message criu_opts { optional string tls_key = 57; optional bool tls = 58; optional bool tls_no_cn_verify = 59; + optional string cgroup_yard = 60; + optional criu_pre_dump_mode pre_dump_mode = 61 [default = SPLICE]; /* optional bool check_mounts = 128; */ } diff --git a/images/sk-opts.proto b/images/sk-opts.proto index c93ec5fd5c..336cca22ab 100644 --- a/images/sk-opts.proto +++ b/images/sk-opts.proto @@ -23,6 +23,10 @@ message sk_opts_entry { repeated fixed64 so_filter = 16; optional bool so_reuseport = 17; optional bool so_broadcast = 18; + optional bool so_keepalive = 19; + optional uint32 tcp_keepcnt = 20; + optional uint32 tcp_keepidle = 21; + optional uint32 tcp_keepintvl = 22; } enum sk_shutdown { diff --git a/images/sk-unix.proto b/images/sk-unix.proto index c59644f6ea..c24ca92fc8 100644 --- 
a/images/sk-unix.proto +++ b/images/sk-unix.proto @@ -10,6 +10,17 @@ message file_perms_entry { required uint32 gid = 3; } +/* + * Bitmask for unix_sk_entry::uflags + */ +enum unix_uflags { + EXTERN = 1; + SERVICE = 2; + CALLBACK = 4; + INHERIT = 8; + BINDMOUNT = 16; +} + message unix_sk_entry { /* * Few words about why we need both -- id and ino. diff --git a/include/common/compiler.h b/include/common/compiler.h index fc8abcfef4..1d431a5293 100644 --- a/include/common/compiler.h +++ b/include/common/compiler.h @@ -22,6 +22,7 @@ #define __used __attribute__((__used__)) #define __maybe_unused __attribute__((unused)) #define __always_unused __attribute__((unused)) +#define __must_check __attribute__((__warn_unused_result__)) #define __section(S) __attribute__ ((__section__(#S))) @@ -99,4 +100,30 @@ #define is_log2(v) (((v) & ((v) - 1)) == 0) +/* + * Use "__ignore_value" to avoid a warning when using a function declared with + * gcc's warn_unused_result attribute, but for which you really do want to + * ignore the result. Traditionally, people have used a "(void)" cast to + * indicate that a function's return value is deliberately unused. However, + * if the function is declared with __attribute__((warn_unused_result)), + * gcc issues a warning even with the cast. + * + * Caution: most of the time, you really should heed gcc's warning, and + * check the return value. However, in those exceptional cases in which + * you're sure you know what you're doing, use this function. + * + * Normally casting an expression to void discards its value, but GCC + * versions 3.4 and newer have __attribute__ ((__warn_unused_result__)) + * which may cause unwanted diagnostics in that case. Use __typeof__ + * and __extension__ to work around the problem, if the workaround is + * known to be needed. + * Written by Jim Meyering, Eric Blake and Pádraig Brady. 
+ * (See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66425 for the details) + */ +#if 3 < __GNUC__ + (4 <= __GNUC_MINOR__) +# define __ignore_value(x) ({ __typeof__ (x) __x = (x); (void) __x; }) +#else +# define __ignore_value(x) ((void) (x)) +#endif + #endif /* __CR_COMPILER_H__ */ diff --git a/include/common/scm.h b/include/common/scm.h index ab27137b82..a8eb9ec4c7 100644 --- a/include/common/scm.h +++ b/include/common/scm.h @@ -3,7 +3,9 @@ #include #include +#include #include +#include /* * Because of kernel doing kmalloc for user data passed diff --git a/lib/c/criu.c b/lib/c/criu.c index 17d5c3983d..1d0a235f40 100644 --- a/lib/c/criu.c +++ b/lib/c/criu.c @@ -336,6 +336,21 @@ int criu_set_parent_images(const char *path) return criu_local_set_parent_images(global_opts, path); } +int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode) +{ + opts->rpc->has_pre_dump_mode = true; + if (mode == CRIU_PRE_DUMP_SPLICE || mode == CRIU_PRE_DUMP_READ) { + opts->rpc->pre_dump_mode = (CriuPreDumpMode)mode; + return 0; + } + return -1; +} + +int criu_set_pre_dump_mode(enum criu_pre_dump_mode mode) +{ + return criu_local_set_pre_dump_mode(global_opts, mode); +} + void criu_local_set_track_mem(criu_opts *opts, bool track_mem) { opts->rpc->has_track_mem = true; @@ -987,6 +1002,19 @@ int criu_local_add_cg_dump_controller(criu_opts *opts, const char *name) return 0; } +int criu_local_add_cg_yard(criu_opts *opts, const char *path) +{ + char *new; + + new = strdup(path); + if (!new) + return -ENOMEM; + + free(opts->rpc->cgroup_yard); + opts->rpc->cgroup_yard = new; + return 0; +} + int criu_add_skip_mnt(const char *mnt) { return criu_local_add_skip_mnt(global_opts, mnt); diff --git a/lib/c/criu.h b/lib/c/criu.h index 76f3547fcd..22db0fdcfd 100644 --- a/lib/c/criu.h +++ b/lib/c/criu.h @@ -43,6 +43,11 @@ enum criu_cg_mode { CRIU_CG_MODE_DEFAULT, }; +enum criu_pre_dump_mode { + CRIU_PRE_DUMP_SPLICE = 1, + CRIU_PRE_DUMP_READ = 2 +}; + int 
criu_set_service_address(const char *path); void criu_set_service_fd(int fd); int criu_set_service_binary(const char *path); @@ -95,6 +100,7 @@ int criu_add_irmap_path(const char *path); int criu_add_inherit_fd(int fd, const char *key); int criu_add_external(const char *key); int criu_set_page_server_address_port(const char *address, int port); +int criu_set_pre_dump_mode(enum criu_pre_dump_mode mode); /* * The criu_notify_arg_t na argument is an opaque @@ -207,9 +213,11 @@ int criu_local_add_irmap_path(criu_opts *opts, const char *path); int criu_local_add_cg_props(criu_opts *opts, const char *stream); int criu_local_add_cg_props_file(criu_opts *opts, const char *path); int criu_local_add_cg_dump_controller(criu_opts *opts, const char *name); +int criu_local_add_cg_yard(criu_opts *opts, const char *path); int criu_local_add_inherit_fd(criu_opts *opts, int fd, const char *key); int criu_local_add_external(criu_opts *opts, const char *key); int criu_local_set_page_server_address_port(criu_opts *opts, const char *address, int port); +int criu_local_set_pre_dump_mode(criu_opts *opts, enum criu_pre_dump_mode mode); void criu_local_set_notify_cb(criu_opts *opts, int (*cb)(char *action, criu_notify_arg_t na)); diff --git a/lib/py/images/images.py b/lib/py/images/images.py index f4517d8459..dca080657a 100644 --- a/lib/py/images/images.py +++ b/lib/py/images/images.py @@ -244,7 +244,7 @@ def load(self, f, pretty=False, no_payload=False): while True: gc = pb.ghost_chunk_entry() buf = f.read(4) - if buf == '': + if len(buf) == 0: break size, = struct.unpack('i', buf) gc.ParseFromString(f.read(size)) @@ -252,13 +252,13 @@ def load(self, f, pretty=False, no_payload=False): if no_payload: f.seek(gc.len, os.SEEK_CUR) else: - entry['extra'] = base64.encodebytes(f.read(gc.len)) + entry['extra'] = base64.encodebytes(f.read(gc.len)).decode('utf-8') entries.append(entry) else: if no_payload: f.seek(0, os.SEEK_END) else: - g_entry['extra'] = base64.encodebytes(f.read()) + 
g_entry['extra'] = base64.encodebytes(f.read()).decode('utf-8') entries.append(g_entry) return entries @@ -522,6 +522,8 @@ def skip(self, f, pbuff): 'AUTOFS': entry_handler(pb.autofs_entry), 'FILES': entry_handler(pb.file_entry), 'CPUINFO': entry_handler(pb.cpuinfo_entry), + 'MEMFD_FILE': entry_handler(pb.memfd_file_entry), + 'MEMFD_INODE': entry_handler(pb.memfd_inode_entry), } diff --git a/lib/py/images/pb2dict.py b/lib/py/images/pb2dict.py index daaa7297ea..40a6036cf4 100644 --- a/lib/py/images/pb2dict.py +++ b/lib/py/images/pb2dict.py @@ -1,12 +1,13 @@ -from google.protobuf.descriptor import FieldDescriptor as FD -import opts_pb2 -from ipaddress import IPv4Address, ip_address -from ipaddress import IPv6Address -import socket +import base64 import collections import os -import base64 import quopri +import socket +from ipaddress import IPv4Address, IPv6Address, ip_address + +from google.protobuf.descriptor import FieldDescriptor as FD + +import opts_pb2 if "encodebytes" not in dir(base64): base64.encodebytes = base64.encodestring @@ -105,11 +106,30 @@ def _custom_conv(field): ] rfile_flags_map = [ - ('O_WRONLY', 0o1), - ('O_RDWR', 0o2), - ('O_APPEND', 0o2000), - ('O_DIRECT', 0o40000), - ('O_LARGEFILE', 0o100000), + ('O_WRONLY', 0o00000001), + ('O_RDWR', 0o00000002), + ('O_CREAT', 0o00000100), + ('O_EXCL', 0o00000200), + ('O_NOCTTY', 0o00000400), + ('O_TRUNC', 0o00001000), + ('O_APPEND', 0o00002000), + ('O_NONBLOCK', 0o00004000), + ('O_DSYNC', 0o00010000), + ('FASYNC', 0o00020000), + ('O_DIRECT', 0o00040000), + ('O_LARGEFILE', 0o00100000), + ('O_DIRECTORY', 0o00200000), + ('O_NOFOLLOW', 0o00400000), + ('O_NOATIME', 0o01000000), + ('O_CLOEXEC', 0o02000000), +] + +seals_flags_map = [ + ('F_SEAL_SEAL', 0x0001), + ('F_SEAL_SHRINK', 0x0002), + ('F_SEAL_GROW', 0x0004), + ('F_SEAL_WRITE', 0x0008), + ('F_SEAL_FUTURE_WRITE', 0x0010), ] pmap_flags_map = [ @@ -124,6 +144,7 @@ def _custom_conv(field): 'mmap.status': mmap_status_map, 'rfile.flags': rfile_flags_map, 
'pmap.flags': pmap_flags_map, + 'seals.flags': seals_flags_map, } gen_maps = { diff --git a/scripts/build/Dockerfile.aarch64-cross b/scripts/build/Dockerfile.aarch64-cross new file mode 100644 index 0000000000..252e0f8754 --- /dev/null +++ b/scripts/build/Dockerfile.aarch64-cross @@ -0,0 +1,45 @@ +FROM dockcross/base:latest + +# Add the cross compiler sources +RUN echo "deb http://ftp.us.debian.org/debian/ buster main" >> /etc/apt/sources.list && \ + dpkg --add-architecture arm64 && \ + apt-get install emdebian-archive-keyring + +RUN apt-get update && apt-get install -y \ + crossbuild-essential-arm64 \ + libc6-dev-arm64-cross \ + libc6-arm64-cross \ + libbz2-dev:arm64 \ + libexpat1-dev:arm64 \ + ncurses-dev:arm64 \ + libssl-dev:arm64 \ + protobuf-c-compiler \ + protobuf-compiler \ + python-protobuf \ + libnl-3-dev:arm64 \ + libprotobuf-dev:arm64 \ + libnet-dev:arm64 \ + libprotobuf-c-dev:arm64 \ + libcap-dev:arm64 \ + libaio-dev:arm64 \ + libnl-route-3-dev:arm64 + +ENV CROSS_TRIPLE=aarch64-linux-gnu +ENV CROSS_COMPILE=${CROSS_TRIPLE}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLE} \ + AS=/usr/bin/${CROSS_TRIPLE}-as \ + AR=/usr/bin/${CROSS_TRIPLE}-ar \ + CC=/usr/bin/${CROSS_TRIPLE}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLE}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLE}-g++ \ + LD=/usr/bin/${CROSS_TRIPLE}-ld \ + FC=/usr/bin/${CROSS_TRIPLE}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLE}/pkgconfig \ + ARCH=aarch64 + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.aarch64.hdr b/scripts/build/Dockerfile.aarch64.hdr deleted file mode 100644 index c90c980886..0000000000 --- a/scripts/build/Dockerfile.aarch64.hdr +++ /dev/null @@ -1,3 +0,0 @@ -FROM arm64v8/ubuntu:xenial - -COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static diff --git a/scripts/build/Dockerfile.aarch64.tmpl b/scripts/build/Dockerfile.aarch64.tmpl deleted file mode 120000 index cb804790e6..0000000000 --- a/scripts/build/Dockerfile.aarch64.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.alpine b/scripts/build/Dockerfile.alpine index c71a3901f4..5785102dac 100644 --- a/scripts/build/Dockerfile.alpine +++ b/scripts/build/Dockerfile.alpine @@ -14,10 +14,11 @@ RUN apk update && apk add \ libcap-dev \ libnet-dev \ libnl3-dev \ + nftables \ pkgconfig \ protobuf-c-dev \ protobuf-dev \ - python \ + python3 \ sudo COPY . 
/criu @@ -27,20 +28,24 @@ RUN mv .ccache /tmp && make mrproper && ccache -sz && \ date && make -j $(nproc) CC="$CC" && date && ccache -s RUN apk add \ - py-yaml \ - py-pip \ - py2-future \ ip6tables \ iptables \ + nftables \ iproute2 \ tar \ bash \ go \ e2fsprogs \ + py-yaml \ + py3-flake8 \ asciidoctor # The rpc test cases are running as user #1000, let's add the user RUN adduser -u 1000 -D test -RUN pip install protobuf ipaddress junit_xml +RUN pip3 install protobuf junit_xml + +# For zdtm we need an unversioned python binary +RUN ln -s /usr/bin/python3 /usr/bin/python + RUN make -C test/zdtm diff --git a/scripts/build/Dockerfile.armv7-cross b/scripts/build/Dockerfile.armv7-cross new file mode 100644 index 0000000000..17a55561ec --- /dev/null +++ b/scripts/build/Dockerfile.armv7-cross @@ -0,0 +1,44 @@ +FROM dockcross/base:latest + +# Add the cross compiler sources +RUN echo "deb http://ftp.us.debian.org/debian/ buster main" >> /etc/apt/sources.list && \ + dpkg --add-architecture armhf && \ + apt-get install emdebian-archive-keyring + +RUN apt-get update && apt-get install -y \ + crossbuild-essential-armhf \ + libbz2-dev:armhf \ + libexpat1-dev:armhf \ + ncurses-dev:armhf \ + libssl-dev:armhf \ + protobuf-c-compiler \ + protobuf-compiler \ + python-protobuf \ + libnl-3-dev:armhf \ + libprotobuf-dev:armhf \ + libnet-dev:armhf \ + libprotobuf-c-dev:armhf \ + libcap-dev:armhf \ + libaio-dev:armhf \ + libnl-route-3-dev:armhf + +ENV CROSS_TRIPLE=arm-linux-gnueabihf +ENV CROSS_COMPILE=${CROSS_TRIPLE}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLE} \ + AS=/usr/bin/${CROSS_TRIPLE}-as \ + AR=/usr/bin/${CROSS_TRIPLE}-ar \ + CC=/usr/bin/${CROSS_TRIPLE}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLE}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLE}-g++ \ + LD=/usr/bin/${CROSS_TRIPLE}-ld \ + FC=/usr/bin/${CROSS_TRIPLE}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLE}/pkgconfig \ + ARCH=arm \ + SUBARCH=armv7 + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.armv7hf.hdr b/scripts/build/Dockerfile.armv7hf.hdr index d453d6df70..7c66474e56 100644 --- a/scripts/build/Dockerfile.armv7hf.hdr +++ b/scripts/build/Dockerfile.armv7hf.hdr @@ -1,3 +1 @@ -FROM arm32v7/ubuntu:xenial - -COPY scripts/build/qemu-user-static/usr/bin/qemu-arm-static /usr/bin/qemu-arm-static +FROM arm32v7/ubuntu:bionic diff --git a/scripts/build/Dockerfile.armv7hf.tmpl b/scripts/build/Dockerfile.armv7hf.tmpl index cb804790e6..7bc6d9cde9 120000 --- a/scripts/build/Dockerfile.armv7hf.tmpl +++ b/scripts/build/Dockerfile.armv7hf.tmpl @@ -1 +1 @@ -Dockerfile.tmpl \ No newline at end of file +Dockerfile.linux32.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.centos b/scripts/build/Dockerfile.centos index 2ce40b179b..213be694fb 100644 --- a/scripts/build/Dockerfile.centos +++ b/scripts/build/Dockerfile.centos @@ -23,6 +23,7 @@ RUN yum install -y \ protobuf-devel \ protobuf-python \ python \ + python-flake8 \ python-ipaddress \ python2-future \ python2-junit_xml \ diff --git a/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr b/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr deleted file mode 100644 index 82f29e3365..0000000000 --- a/scripts/build/Dockerfile.fedora-rawhide-aarch64.hdr +++ /dev/null @@ -1,3 +0,0 @@ -FROM arm64v8/fedora:rawhide - -COPY scripts/build/qemu-user-static/usr/bin/qemu-aarch64-static /usr/bin/qemu-aarch64-static diff --git a/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl b/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl deleted file mode 120000 index e4c40309c5..0000000000 --- a/scripts/build/Dockerfile.fedora-rawhide-aarch64.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.fedora.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.fedora.tmpl b/scripts/build/Dockerfile.fedora.tmpl index 280ce1cdd8..138588bce1 100644 --- a/scripts/build/Dockerfile.fedora.tmpl +++ 
b/scripts/build/Dockerfile.fedora.tmpl @@ -3,12 +3,15 @@ ARG ENV1=FOOBAR RUN dnf install -y \ ccache \ + diffutils \ findutils \ gcc \ git \ gnutls-devel \ iproute \ iptables \ + nftables \ + nftables-devel \ libaio-devel \ libasan \ libcap-devel \ @@ -30,12 +33,6 @@ RUN dnf install -y \ rubygem-asciidoctor \ kmod -# Replace coreutils-single with "traditional" coreutils -# to fix the following error on Fedora 28/rawhide while -# running under QEMU: -# > sh: /usr/bin/sort: /usr/bin/coreutils: bad interpreter: No such file or directory -RUN dnf install -y --allowerasing coreutils - RUN ln -sf python3 /usr/bin/python ENV PYTHON=python3 diff --git a/scripts/build/Dockerfile.linux32.tmpl b/scripts/build/Dockerfile.linux32.tmpl new file mode 100644 index 0000000000..5d3fe5139f --- /dev/null +++ b/scripts/build/Dockerfile.linux32.tmpl @@ -0,0 +1,47 @@ +ARG CC=gcc +ARG ENV1=FOOBAR + +RUN apt-get update && apt-get install -y \ + ccache \ + libnet-dev \ + libnl-route-3-dev \ + $CC \ + bsdmainutils \ + build-essential \ + git-core \ + iptables \ + libaio-dev \ + libcap-dev \ + libgnutls28-dev \ + libgnutls30 \ + libnl-3-dev \ + libprotobuf-c-dev \ + libprotobuf-dev \ + libselinux-dev \ + pkg-config \ + protobuf-c-compiler \ + protobuf-compiler \ + python-minimal \ + python-future + +COPY . /criu +WORKDIR /criu +ENV CC="ccache $CC" CCACHE_DIR=/tmp/.ccache CCACHE_NOCOMPRESS=1 $ENV1=yes + +RUN uname -m && setarch linux32 uname -m && setarch --list + +RUN mv .ccache /tmp && make mrproper && ccache -s && \ + date && \ +# Check single object build + setarch linux32 make -j $(nproc) CC="$CC" criu/parasite-syscall.o && \ +# Compile criu + setarch linux32 make -j $(nproc) CC="$CC" && \ + date && \ +# Check that "make mrproper" works + setarch linux32 make mrproper && ! git clean -ndx --exclude=scripts/build \ + --exclude=.config --exclude=test | grep . 
+ +# Compile tests +RUN date && setarch linux32 make -j $(nproc) CC="$CC" -C test/zdtm && date + +#RUN make test/compel/handle_binary && ./test/compel/handle_binary diff --git a/scripts/build/Dockerfile.openj9-alpine b/scripts/build/Dockerfile.openj9-alpine new file mode 100644 index 0000000000..39ea4d08e6 --- /dev/null +++ b/scripts/build/Dockerfile.openj9-alpine @@ -0,0 +1,30 @@ +FROM adoptopenjdk/openjdk8-openj9:alpine + +RUN apk update && apk add \ + bash \ + build-base \ + ccache \ + coreutils \ + git \ + gnutls-dev \ + libaio-dev \ + libcap-dev \ + libnet-dev \ + libnl3-dev \ + pkgconfig \ + protobuf-c-dev \ + protobuf-dev \ + python3 \ + sudo \ + maven \ + ip6tables \ + iptables \ + bash + +COPY . /criu +WORKDIR /criu + +RUN make + +ENTRYPOINT mvn -f test/javaTests/pom.xml test + diff --git a/scripts/build/Dockerfile.openj9-ubuntu b/scripts/build/Dockerfile.openj9-ubuntu new file mode 100644 index 0000000000..f235cc0047 --- /dev/null +++ b/scripts/build/Dockerfile.openj9-ubuntu @@ -0,0 +1,31 @@ +FROM adoptopenjdk/openjdk8-openj9:latest + +RUN apt-get update && apt-get install -y --no-install-recommends protobuf-c-compiler \ + libprotobuf-c-dev \ + libaio-dev \ + python-future \ + libprotobuf-dev \ + protobuf-compiler \ + libcap-dev \ + libnl-3-dev \ + gdb \ + bash \ + python-protobuf \ + python-yaml \ + libnet-dev \ + libnl-route-3-dev \ + libbsd-dev \ + make \ + git \ + pkg-config \ + iptables \ + gcc \ + maven + +COPY . 
/criu +WORKDIR /criu + +RUN make + +ENTRYPOINT mvn -f test/javaTests/pom.xml test + diff --git a/scripts/build/Dockerfile.ppc64-cross b/scripts/build/Dockerfile.ppc64-cross new file mode 100644 index 0000000000..44061c558c --- /dev/null +++ b/scripts/build/Dockerfile.ppc64-cross @@ -0,0 +1,45 @@ +FROM dockcross/base:latest + +# Add the cross compiler sources +RUN echo "deb http://ftp.us.debian.org/debian/ buster main" >> /etc/apt/sources.list && \ + dpkg --add-architecture ppc64el && \ + apt-get install emdebian-archive-keyring + +RUN apt-get update && apt-get install -y \ + crossbuild-essential-ppc64el \ + libc6-dev-ppc64el-cross \ + libc6-ppc64el-cross \ + libbz2-dev:ppc64el \ + libexpat1-dev:ppc64el \ + ncurses-dev:ppc64el \ + libssl-dev:ppc64el \ + protobuf-c-compiler \ + protobuf-compiler \ + python-protobuf \ + libnl-3-dev:ppc64el \ + libprotobuf-dev:ppc64el \ + libnet-dev:ppc64el \ + libprotobuf-c-dev:ppc64el \ + libcap-dev:ppc64el \ + libaio-dev:ppc64el \ + libnl-route-3-dev:ppc64el + +ENV CROSS_TRIPLE=powerpc64le-linux-gnu +ENV CROSS_COMPILE=${CROSS_TRIPLE}- \ + CROSS_ROOT=/usr/${CROSS_TRIPLE} \ + AS=/usr/bin/${CROSS_TRIPLE}-as \ + AR=/usr/bin/${CROSS_TRIPLE}-ar \ + CC=/usr/bin/${CROSS_TRIPLE}-gcc \ + CPP=/usr/bin/${CROSS_TRIPLE}-cpp \ + CXX=/usr/bin/${CROSS_TRIPLE}-g++ \ + LD=/usr/bin/${CROSS_TRIPLE}-ld \ + FC=/usr/bin/${CROSS_TRIPLE}-gfortran + +ENV PATH="${PATH}:${CROSS_ROOT}/bin" \ + PKG_CONFIG_PATH=/usr/lib/${CROSS_TRIPLE}/pkgconfig \ + ARCH=ppc64 + +COPY . 
/criu +WORKDIR /criu + +RUN make mrproper && date && make -j $(nproc) zdtm && date diff --git a/scripts/build/Dockerfile.ppc64le.hdr b/scripts/build/Dockerfile.ppc64le.hdr deleted file mode 100644 index ba65901c2b..0000000000 --- a/scripts/build/Dockerfile.ppc64le.hdr +++ /dev/null @@ -1,5 +0,0 @@ -FROM ppc64le/ubuntu:xenial - -ENV QEMU_CPU POWER8 -COPY scripts/build/qemu-user-static/usr/bin/qemu-ppc64le-static /usr/bin/qemu-ppc64le-static -RUN sed -i '/security/ d' /etc/apt/sources.list diff --git a/scripts/build/Dockerfile.ppc64le.tmpl b/scripts/build/Dockerfile.ppc64le.tmpl deleted file mode 120000 index cb804790e6..0000000000 --- a/scripts/build/Dockerfile.ppc64le.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.tmpl \ No newline at end of file diff --git a/scripts/build/Dockerfile.s390x.hdr b/scripts/build/Dockerfile.s390x.hdr deleted file mode 100644 index e02097f625..0000000000 --- a/scripts/build/Dockerfile.s390x.hdr +++ /dev/null @@ -1,6 +0,0 @@ -FROM s390x/debian:latest - -ENV QEMU_CPU z900 -COPY scripts/build/qemu-user-static/usr/bin/qemu-s390x-static /usr/bin/qemu-s390x-static -# The security repository does not seem to exist anymore -RUN sed -i '/security/ d' /etc/apt/sources.list diff --git a/scripts/build/Dockerfile.s390x.tmpl b/scripts/build/Dockerfile.s390x.tmpl deleted file mode 120000 index cb804790e6..0000000000 --- a/scripts/build/Dockerfile.s390x.tmpl +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.tmpl \ No newline at end of file diff --git a/scripts/build/Makefile b/scripts/build/Makefile index bb2e9ca9d2..855539152f 100644 --- a/scripts/build/Makefile +++ b/scripts/build/Makefile @@ -1,8 +1,8 @@ -QEMU_ARCHES := armv7hf aarch64 ppc64le s390x fedora-rawhide-aarch64 # require qemu -ARCHES := $(QEMU_ARCHES) x86_64 fedora-asan fedora-rawhide centos +ARCHES := x86_64 fedora-asan fedora-rawhide centos armv7hf TARGETS := $(ARCHES) alpine TARGETS_CLANG := $(addsuffix $(TARGETS),-clang) CONTAINER_RUNTIME := docker +TARGETS += armv7-cross aarch64-cross 
ppc64-cross all: $(TARGETS) $(TARGETS_CLANG) .PHONY: all @@ -16,15 +16,6 @@ $(foreach arch,$(ARCHES),$(eval $(call ARCH_DEP,$(arch)))) Dockerfile.%: Dockerfile.%.hdr Dockerfile.%.tmpl cat $^ > $@ -qemu-user-static: - ./extract-deb-pkg qemu-user-static - -binfmt_misc: - ./binfmt_misc -.PHONY: binfmt_misc - -$(QEMU_ARCHES): qemu-user-static binfmt_misc - $(TARGETS): mkdir -p $(HOME)/.ccache mv $(HOME)/.ccache ../../ @@ -42,12 +33,3 @@ $(foreach t,$(TARGETS),$(eval $(call CLANG_DEP,$(t)))) %-clang: DB_ENV=--build-arg ENV1=CCACHE_CPP2 s390x-clang: DB_CC=--build-arg CC=clang-3.8 .PHONY: $(TARGETS_CLANG) - -clean: - rm -rf qemu-user-static - for ARCH in $(ARCHES); do \ - FILE=/proc/sys/fs/binfmt_misc/$$ARCH; \ - test -f $$FILE && echo -1 > $$FILE; \ - rm -f Dockerfile.$$ARCH; \ - done -.PHONY: clean diff --git a/scripts/build/binfmt_misc b/scripts/build/binfmt_misc deleted file mode 100755 index bf2a2ecad6..0000000000 --- a/scripts/build/binfmt_misc +++ /dev/null @@ -1,13 +0,0 @@ -set -e -x - -test -f /proc/sys/fs/binfmt_misc/armv7hf || - echo ':armv7hf:M::\x7fELF\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x28\x00:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\xff:/usr/bin/qemu-arm-static:' > /proc/sys/fs/binfmt_misc/register; - -test -f /proc/sys/fs/binfmt_misc/aarch64 || - echo ':aarch64:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\xb7:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-aarch64-static:' > /proc/sys/fs/binfmt_misc/register - -test -f /proc/sys/fs/binfmt_misc/ppc64le || - echo ':ppc64le:M::\x7fELF\x02\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x15\x00:\xff\xff\xff\xff\xff\xff\xff\xfc\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff\x00:/usr/bin/qemu-ppc64le-static:' > /proc/sys/fs/binfmt_misc/register - -test -f /proc/sys/fs/binfmt_misc/s390x || - echo 
':s390x:M::\x7fELF\x02\x02\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x16:\xff\xff\xff\xff\xff\xff\xff\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xfe\xff\xff:/usr/bin/qemu-s390x-static:' > /proc/sys/fs/binfmt_misc/register diff --git a/scripts/build/extract-deb-pkg b/scripts/build/extract-deb-pkg deleted file mode 100755 index 44457bc5a4..0000000000 --- a/scripts/build/extract-deb-pkg +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -e -set -u -set -o pipefail -MIRROR="https://mirrors.kernel.org/ubuntu" -PKGS="$MIRROR/dists/bionic/universe/binary-amd64/Packages.gz" - -if [ $# -ne 1 ]; then - echo "Usage: $0 package-name" 1>&2 - exit 1 -fi - -if [ -d "$1" ]; then - echo "Directory $1 already exists -- exiting" - exit 0 -fi - -if ! pkg=$(curl -sSL "$PKGS" | zgrep "Filename.*$1" | awk '{ print $2 }'); then - echo "ERROR: no packages matching $1" 1>&2 - exit 1 -fi - -if [ "$(wc -w <<< "$pkg")" -gt 1 ]; then - echo "$pkg" 1>&2 - echo "ERROR: more than one match for $1" 1>&2 - exit 1 -fi - -mkdir "$1" -cd "$1" - -wget "$MIRROR/$pkg" -pkg=$(basename "$pkg") -ar vx "$pkg" -tar xJvf data.tar.xz diff --git a/scripts/criu-ns b/scripts/criu-ns new file mode 100755 index 0000000000..e065c59716 --- /dev/null +++ b/scripts/criu-ns @@ -0,0 +1,252 @@ +#!/usr/bin/env python +import ctypes +import ctypes.util +import errno +import sys +import os + +# constants for unshare +CLONE_NEWNS = 0x00020000 +CLONE_NEWPID = 0x20000000 + +# - constants for mount +MS_REC = 16384 +MS_PRIVATE = 1 << 18 +MS_SLAVE = 1 << 19 + +# Load libc bindings +_libc = ctypes.CDLL(ctypes.util.find_library("c"), use_errno=True) + +try: + _unshare = _libc.unshare +except AttributeError: + raise OSError(errno.EINVAL, "unshare is not supported on this platform") +else: + _unshare.argtypes = [ ctypes.c_int ] + _unshare.restype = ctypes.c_int + +try: + _setns = _libc.setns +except AttributeError: + raise OSError(errno.EINVAL, "setns is not supported on this platform") +else: + _setns.argtypes = [ ctypes.c_int, 
ctypes.c_int ] + _setns.restype = ctypes.c_int + +try: + _mount = _libc.mount +except AttributeError: + raise OSError(errno.EINVAL, "mount is not supported on this platform") +else: + _mount.argtypes = [ + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_char_p, + ctypes.c_ulong, + ctypes.c_void_p + ] + _mount.restype = ctypes.c_int + +try: + _umount = _libc.umount +except AttributeError: + raise OSError(errno.EINVAL, "umount is not supported on this platform") +else: + _umount.argtypes = [ctypes.c_char_p] # umount(2) takes a path string + _umount.restype = ctypes.c_int + + +def run_criu(): + print(sys.argv) + os.execlp('criu', *['criu'] + sys.argv[1:]) + + +def wrap_restore(): + # Unshare pid and mount namespaces + if _unshare(CLONE_NEWNS | CLONE_NEWPID) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + (r_pipe, w_pipe) = os.pipe() + + # Spawn the init + if os.fork() == 0: + os.close(r_pipe) + + # Mount new /proc + if _mount(None, b"/", None, MS_SLAVE|MS_REC, None) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + if _mount(b'proc', b'/proc', b'proc', 0, None) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + # Spawn CRIU binary + criu_pid = os.fork() + if criu_pid == 0: + run_criu() + raise OSError(errno.ENOENT, "No such command") + + while True: + try: + (pid, status) = os.wait() + if pid == criu_pid: + status = os.WEXITSTATUS(status) + break + except OSError: + status = -251 + break + + os.write(w_pipe, b"%d" % status) + os.close(w_pipe) + + if status != 0: + sys.exit(status) + + while True: + try: + os.wait() + except OSError: + break + + sys.exit(0) + + # Wait for CRIU to exit and report the status back + os.close(w_pipe) + status = os.read(r_pipe, 1024) + if not status.isdigit(): + status_i = -252 + else: + status_i = int(status) + + return status_i + + +def get_varg(args): + for i in range(1, len(sys.argv)): + if not sys.argv[i] in args: + continue + + if i + 1 >= 
len(sys.argv): + break + + return (sys.argv[i + 1], i + 1) + + return (None, None) + + + +def set_pidns(tpid, pid_idx): + # Join pid namespace. Note that the given pid should + # be changed in the -t option, as the task lives in a + # different pid namespace. + + myns = os.stat('/proc/self/ns/pid').st_ino + + ns_fd = os.open('/proc/%s/ns/pid' % tpid, os.O_RDONLY) + if myns != os.fstat(ns_fd).st_ino: + + for l in open('/proc/%s/status' % tpid): + if not l.startswith('NSpid:'): + continue + + ls = l.split() + if ls[1] != tpid: + raise OSError(errno.ESRCH, 'No such pid') + + print('Replace pid {} with {}'.format(tpid, ls[2])) + sys.argv[pid_idx] = ls[2] + break + else: + raise OSError(errno.ENOENT, 'Cannot find NSpid field in proc') + + if _setns(ns_fd, 0) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + os.close(ns_fd) + + +def set_mntns(tpid): + # Join mount namespace. Trick here too -- check / and . + # will be the same in target mntns. + + myns = os.stat('/proc/self/ns/mnt').st_ino + ns_fd = os.open('/proc/%s/ns/mnt' % tpid, os.O_RDONLY) + if myns != os.fstat(ns_fd).st_ino: + root_st = os.stat('/') + cwd_st = os.stat('.') + cwd_path = os.path.realpath('.') + + if _setns(ns_fd, 0) != 0: + _errno = ctypes.get_errno() + raise OSError(_errno, errno.errorcode[_errno]) + + os.chdir(cwd_path) + root_nst = os.stat('/') + cwd_nst = os.stat('.') + + def steq(st, nst): + return (st.st_dev, st.st_ino) == (nst.st_dev, nst.st_ino) + + if not steq(root_st, root_nst): + raise OSError(errno.EXDEV, 'Target ns / is not as current') + if not steq(cwd_st, cwd_nst): + raise OSError(errno.EXDEV, 'Target ns . 
is not as current') + + + os.close(ns_fd) + + +def wrap_dump(): + (pid, pid_idx) = get_varg(('-t', '--tree')) + if pid is None: + raise OSError(errno.EINVAL, 'No --tree option given') + + set_pidns(pid, pid_idx) + set_mntns(pid) + + # Spawn CRIU binary + criu_pid = os.fork() + if criu_pid == 0: + run_criu() + raise OSError(errno.ENOENT, "No such command") + + # Wait for CRIU to exit and report the status back + while True: + try: + (pid, status) = os.wait() + if pid == criu_pid: + status = os.WEXITSTATUS(status) + break + except OSError: + status = -251 + break + + return status + + +if len(sys.argv) == 1: + print(""" +Usage: + {0} dump|pre-dump -t PID [] + {0} restore [] +\nCommands: + dump checkpoint a process/tree identified by pid + pre-dump pre-dump task(s) minimizing their frozen time + restore restore a process/tree +""".format(sys.argv[0])) + exit(1) + +action = sys.argv[1] + +if action == 'restore': + res = wrap_restore() +elif action == 'dump' or action == 'pre-dump': + res = wrap_dump() +else: + print('Unsupported action {} for nswrap'.format(action)) + res = -1 + +sys.exit(res) diff --git a/scripts/feature-tests.mak b/scripts/feature-tests.mak index e39d97bb11..21b3900923 100644 --- a/scripts/feature-tests.mak +++ b/scripts/feature-tests.mak @@ -136,3 +136,49 @@ ENTRY(main) nop END(main) endef + +define FEATURE_TEST_FSCONFIG + +#include + +int main(void) +{ + if (FSCONFIG_CMD_CREATE > 0) + return 0; + return 0; +} + +endef + +define FEATURE_TEST_NFTABLES_LIB_API_0 + +#include + +int main(int argc, char **argv) +{ + return nft_run_cmd_from_buffer(nft_ctx_new(NFT_CTX_DEFAULT), \"cmd\", strlen(\"cmd\")); +} + +endef + +define FEATURE_TEST_NFTABLES_LIB_API_1 + +#include + +int main(int argc, char **argv) +{ + return nft_run_cmd_from_buffer(nft_ctx_new(NFT_CTX_DEFAULT), \"cmd\"); +} + +endef + +define FEATURE_TEST_MEMFD_CREATE + +#include +#include + +int main(void) +{ + return memfd_create(NULL, 0); +} +endef diff --git a/scripts/nmk/scripts/include.mk 
b/scripts/nmk/scripts/include.mk index e1701103f9..c1c1e94af4 100644 --- a/scripts/nmk/scripts/include.mk +++ b/scripts/nmk/scripts/include.mk @@ -8,23 +8,21 @@ endif # # Common vars. -SUBARCH := $(shell uname -m | sed \ - -e s/i.86/x86/ \ - -e s/x86_64/x86/ \ - -e s/sun4u/sparc64/ \ - -e s/arm.*/arm/ \ - -e s/sa110/arm/ \ - -e s/s390x/s390/ \ - -e s/parisc64/parisc/ \ - -e s/ppc64.*/ppc64/ \ - -e s/mips.*/mips/ \ - -e s/sh[234].*/sh/ \ +SUBARCH ?= $(shell uname -m) +ARCH ?= $(shell echo $(SUBARCH) | sed \ + -e s/i.86/x86/ \ + -e s/x86_64/x86/ \ + -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ \ + -e s/sa110/arm/ \ + -e s/s390x/s390/ \ + -e s/parisc64/parisc/ \ + -e s/ppc64.*/ppc64/ \ + -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ \ -e s/aarch64.*/aarch64/) -ARCH ?= $(SUBARCH) -SRCARCH := $(ARCH) - -export SUBARCH ARCH SRCARCH +export SUBARCH ARCH ifndef ____nmk_defined__tools include $(__nmk_dir)tools.mk diff --git a/scripts/nmk/scripts/utils.mk b/scripts/nmk/scripts/utils.mk index 0cf216bc06..b9790615ca 100644 --- a/scripts/nmk/scripts/utils.mk +++ b/scripts/nmk/scripts/utils.mk @@ -3,7 +3,7 @@ ifndef ____nmk_defined__utils # # Usage: option := $(call try-compile,language,source-to-build,cc-options,cc-defines) try-compile = $(shell sh -c 'echo "$(2)" | \ - $(CC) $(4) -x $(1) - $(3) -o /dev/null > /dev/null 2>&1 && \ + $(CC) $(CFLAGS) $(LDFLAGS) $(4) -x $(1) - $(3) -o /dev/null > /dev/null 2>&1 && \ echo true || echo false') # diff --git a/scripts/travis/Makefile b/scripts/travis/Makefile index baddd6eb10..17abb703a8 100644 --- a/scripts/travis/Makefile +++ b/scripts/travis/Makefile @@ -13,6 +13,9 @@ endif TARGETS := alpine fedora-rawhide centos ZDTM_OPTIONS := +UNAME := $(shell uname -m) + +export UNAME alpine: ZDTM_OPTIONS=-x zdtm/static/binfmt_misc -x zdtm/static/netns-nf -x zdtm/static/sched_policy00 -x zdtm/static/seccomp_strict -x zdtm/static/sigaltstack -x zdtm/static/signalfd00 -x zdtm/static/config_inotify_irmap @@ -23,17 +26,31 @@ define DOCKER_JSON endef export 
DOCKER_JSON -$(TARGETS): - echo "$$DOCKER_JSON" > /etc/docker/daemon.json - systemctl restart docker + +ifeq ($(UNAME),x86_64) + # On anything besides x86_64 Travis is running unprivileged LXD + # containers which do not support running docker with '--privileged'. + CONTAINER_OPTS := --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run +else + CONTAINER_OPTS := --rm -v /lib/modules:/lib/modules --tmpfs /run +endif + +restart-docker: + if [ "$$UNAME" = "x86_64" ]; then \ + echo "$$DOCKER_JSON" > /etc/docker/daemon.json; \ + cat /etc/docker/daemon.json; \ + systemctl status docker; \ + systemctl restart docker; \ + systemctl status docker; \ + fi + +$(TARGETS): restart-docker $(MAKE) -C ../build $@$(target-suffix) - docker run --env-file docker.env --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run criu-$@ scripts/travis/travis-tests + docker run --env-file docker.env $(CONTAINER_OPTS) criu-$@ scripts/travis/travis-tests -fedora-asan: - echo "$$DOCKER_JSON" > /etc/docker/daemon.json - systemctl restart docker +fedora-asan: restart-docker $(MAKE) -C ../build $@$(target-suffix) - docker run --rm -it --privileged -v /lib/modules:/lib/modules --tmpfs /run criu-$@ ./scripts/travis/asan.sh $(ZDTM_OPTIONS) + docker run -it $(CONTAINER_OPTS) criu-$@ ./scripts/travis/asan.sh $(ZDTM_OPTIONS) docker-test: ./docker-test.sh @@ -41,5 +58,11 @@ docker-test: podman-test: ./podman-test.sh +# overlayfs behaves differently on Ubuntu and breaks CRIU +# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 +# Switch to devicemapper +openj9-test: restart-docker + ./openj9-test.sh + %: $(MAKE) -C ../build $@$(target-suffix) diff --git a/scripts/travis/docker-test.sh b/scripts/travis/docker-test.sh index ee96fef48d..ac420a4450 100755 --- a/scripts/travis/docker-test.sh +++ b/scripts/travis/docker-test.sh @@ -19,11 +19,16 @@ apt-get update -qq apt-get install -qq docker-ce -cat > /etc/docker/daemon.json < /etc/docker/daemon.json +else + echo '{ "experimental": 
true }' > /etc/docker/daemon.json +fi service docker restart diff --git a/scripts/travis/openj9-test.sh b/scripts/travis/openj9-test.sh new file mode 100755 index 0000000000..968f064f85 --- /dev/null +++ b/scripts/travis/openj9-test.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +cd ../.. + +failures="" + +docker build -t criu-openj9-ubuntu-test:latest -f scripts/build/Dockerfile.openj9-ubuntu . +docker run --rm --privileged criu-openj9-ubuntu-test:latest +if [ $? -ne 0 ]; then + failures=`echo "$failures ubuntu"` +fi + +docker build -t criu-openj9-alpine-test:latest -f scripts/build/Dockerfile.openj9-alpine . +docker run --rm --privileged criu-openj9-alpine-test:latest +if [ $? -ne 0 ]; then + failures=`echo "$failures alpine"` +fi + +if [ -n "$failures" ]; then + echo "Tests failed on $failures" + exit 1 +fi diff --git a/scripts/travis/podman-test.sh b/scripts/travis/podman-test.sh index 9bd1f3d8bc..7490d5fe9a 100755 --- a/scripts/travis/podman-test.sh +++ b/scripts/travis/podman-test.sh @@ -1,7 +1,13 @@ #!/bin/bash set -x -e -o pipefail -add-apt-repository -y ppa:projectatomic/ppa +echo 'deb http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/xUbuntu_18.04/ /' > /etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list + +wget -nv https://download.opensuse.org/repositories/devel:kubic:libcontainers:stable/xUbuntu_18.04/Release.key -O- | apt-key add - + +# podman conflicts with a man page from docker-ce +# this is a podman packaging bug (https://github.com/containers/libpod/issues/4747) +apt-get -y purge docker-ce apt-get install -qq \ apt-transport-https \ @@ -10,8 +16,7 @@ apt-get install -qq \ software-properties-common apt-get update -qq - -apt-get install -qqy podman +apt-get install -qqy podman containernetworking-plugins export SKIP_TRAVIS_TEST=1 @@ -21,41 +26,43 @@ cd ../../ make install -podman info +# overlayfs behaves differently on Ubuntu and breaks CRIU +# https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1857257 +podman 
--storage-driver vfs info criu --version podman run --name cr -d docker.io/library/alpine /bin/sh -c 'i=0; while true; do echo $i; i=$(expr $i + 1); sleep 1; done' sleep 1 -for i in `seq 50`; do +for i in `seq 20`; do echo "Test $i for podman container checkpoint" podman exec cr ps axf podman logs cr - [ `podman ps -f name=cr -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "1" ] podman container checkpoint cr - [ `podman ps -f name=cr -q | wc -l` -eq "0" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "0" ] podman ps -a podman container restore cr - [ `podman ps -f name=cr -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "1" ] podman logs cr done -for i in `seq 50`; do +for i in `seq 20`; do echo "Test $i for podman container checkpoint --export" podman ps -a podman exec cr ps axf podman logs cr - [ `podman ps -f name=cr -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "1" ] podman container checkpoint -l --export /tmp/chkpt.tar.gz - [ `podman ps -f name=cr -q | wc -l` -eq "0" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "0" ] podman ps -a podman rm -fa podman ps -a podman container restore --import /tmp/chkpt.tar.gz - [ `podman ps -f name=cr -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "1" ] podman container restore --name cr2 --import /tmp/chkpt.tar.gz - [ `podman ps -f name=cr2 -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr2 -q -f status=running | wc -l` -eq "1" ] podman ps -a podman logs cr podman logs cr2 @@ -63,7 +70,7 @@ for i in `seq 50`; do podman rm -fa podman ps -a podman container restore --import /tmp/chkpt.tar.gz - [ `podman ps -f name=cr -q | wc -l` -eq "1" ] + [ `podman ps -f name=cr -q -f status=running | wc -l` -eq "1" ] podman ps -a rm -f /tmp/chkpt.tar.gz done diff --git a/scripts/travis/travis-tests b/scripts/travis/travis-tests index 980d747348..4cb842c973 100755 --- 
a/scripts/travis/travis-tests +++ b/scripts/travis/travis-tests @@ -1,17 +1,31 @@ #!/bin/sh set -x -e -TRAVIS_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev +TRAVIS_PKGS="protobuf-c-compiler libprotobuf-c-dev libaio-dev python-future libgnutls28-dev libgnutls30 libprotobuf-dev protobuf-compiler - libcap-dev libnl-3-dev gcc-multilib gdb bash python-protobuf - libnet-dev util-linux asciidoctor libnl-route-3-dev" + libcap-dev libnl-3-dev gdb bash python-protobuf python-yaml + libnet-dev util-linux asciidoctor libnl-route-3-dev + python-junit.xml python-ipaddress time ccache flake8 + libbsd-dev" + +X86_64_PKGS="gcc-multilib" + +UNAME_M=`uname -m` + +if [ "$UNAME_M" != "x86_64" ]; then + # For Travis only x86_64 seems to be baremetal. Other + # architectures are running in unprivileged LXD containers. + # That seems to block most of CRIU's interfaces. + SKIP_TRAVIS_TEST=1 +fi travis_prep () { [ -n "$SKIP_TRAVIS_PREP" ] && return cd ../../ - service apport stop + # This can fail on aarch64 travis + service apport stop || : CC=gcc # clang support @@ -37,29 +51,39 @@ travis_prep () { CC="ccache $CC" fi - # The /etc/apt/sources.list in the current trusty image for ppc64le is - # broken and needs to be fixed - if [ "$TR_ARCH" = "ppc64le" ] ; then - sed -i '/security/ d' /etc/apt/sources.list + # Do not install x86_64 specific packages on other architectures + if [ "$UNAME_M" = "x86_64" ]; then + TRAVIS_PKGS="$TRAVIS_PKGS $X86_64_PKGS" fi apt-get update -qq apt-get install -qq --no-install-recommends $TRAVIS_PKGS - # travis is based on 14.04 and that does not have python - # packages for future and ipaddress (16.04 has those packages) - pip install junit-xml future ipaddress chmod a+x $HOME } travis_prep -ulimit -c unlimited -echo "|`pwd`/test/abrt.sh %P %p %s %e" > /proc/sys/kernel/core_pattern - export GCOV +$CC --version time make CC="$CC" -j4 -[ -n "$SKIP_TRAVIS_TEST" ] && return +./criu/criu -v4 cpuinfo dump || : +./criu/criu -v4 cpuinfo check || : + +make lint + 
+# Check that help output fits into 80 columns +WIDTH=$(./criu/criu --help | wc --max-line-length) +if [ "$WIDTH" -gt 80 ]; then + echo "criu --help output does not obey 80 characters line width!" + exit 1 +fi + +[ -n "$SKIP_TRAVIS_TEST" ] && exit 0 + +ulimit -c unlimited + +echo "|`pwd`/test/abrt.sh %P %p %s %e" > /proc/sys/kernel/core_pattern if [ "${COMPAT_TEST}x" = "yx" ] ; then # Dirty hack to keep both ia32 & x86_64 shared libs on a machine: @@ -121,11 +145,19 @@ else fi LAZY_EXCLUDE="$LAZY_EXCLUDE -x maps04" +# Starting with 5.4 kernel requires CAP_SYS_PTRACE to use uffd events; as such +# we cannot run lazy-pages tests in uns +LAZY_FLAVORS="" +if [ $KERN_MAJ -gt "5" ] || { [ $KERN_MAJ -eq "5" ] && [ $KERN_MIN -ge "4" ]; }; then + LAZY_FLAVORS="-f h,ns" +fi + LAZY_TESTS=.*\(maps0\|uffd-events\|lazy-thp\|futex\|fork\).* +LAZY_OPTS="-p 2 -T $LAZY_TESTS $LAZY_EXCLUDE $LAZY_FLAVORS $ZDTM_OPTS" -./test/zdtm.py run -p 2 -T $LAZY_TESTS --lazy-pages $LAZY_EXCLUDE $ZDTM_OPTS -./test/zdtm.py run -p 2 -T $LAZY_TESTS --remote-lazy-pages $LAZY_EXCLUDE $ZDTM_OPTS -./test/zdtm.py run -p 2 -T $LAZY_TESTS --remote-lazy-pages --tls $LAZY_EXCLUDE $ZDTM_OPTS +./test/zdtm.py run $LAZY_OPTS --lazy-pages +./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages +./test/zdtm.py run $LAZY_OPTS --remote-lazy-pages --tls bash ./test/jenkins/criu-fault.sh bash ./test/jenkins/criu-fcg.sh @@ -161,16 +193,7 @@ ip net add test ./test/zdtm.py run -t zdtm/static/env00 -k always ./test/crit-recode.py -make -C test/others/shell-job +# libcriu testing +make -C test/others/libcriu run -if ! [ -x "$(command -v flake8)" ]; then - pip install flake8 -fi -make lint - -# Check that help output fits into 80 columns -WIDTH=$(./criu/criu --help | wc --max-line-length) -if [ "$WIDTH" -gt 80 ]; then - echo "criu --help output does not obey 80 characters line width!" 
- exit 1 -fi +make -C test/others/shell-job diff --git a/soccr/test/tcp-conn.c b/soccr/test/tcp-conn.c index 1a1a5bb395..e31f58e7ec 100644 --- a/soccr/test/tcp-conn.c +++ b/soccr/test/tcp-conn.c @@ -23,7 +23,7 @@ static void pr_printf(unsigned int level, const char *fmt, ...) va_end(args); } -int main() +int main(void) { union libsoccr_addr addr, dst; int srv, sock, clnt, rst; diff --git a/soccr/test/tcp-constructor.c b/soccr/test/tcp-constructor.c index 89f2010001..973dbf10c8 100644 --- a/soccr/test/tcp-constructor.c +++ b/soccr/test/tcp-constructor.c @@ -20,7 +20,7 @@ struct tcp { uint16_t wscale; }; -static void usage() +static void usage(void) { printf( "Usage: --addr ADDR -port PORT --seq SEQ --next --addr ADDR -port PORT --seq SEQ -- CMD ...\n" diff --git a/test/crit-recode.py b/test/crit-recode.py index a7dcc72729..adaf337336 100755 --- a/test/crit-recode.py +++ b/test/crit-recode.py @@ -47,6 +47,8 @@ def recode_and_check(imgf, o_img, pretty): continue if imgf_b.startswith(b'ip6tables-'): continue + if imgf_b.startswith(b'nftables-'): + continue if imgf_b.startswith(b'route-'): continue if imgf_b.startswith(b'route6-'): diff --git a/test/inhfd/memfd.py b/test/inhfd/memfd.py new file mode 100755 index 0000000000..b06e35068f --- /dev/null +++ b/test/inhfd/memfd.py @@ -0,0 +1,28 @@ +import ctypes +import os +libc = ctypes.CDLL(None) + + +def memfd_create(name, flags): + return libc.memfd_create(name.encode('utf8'), flags) + + +def create_fds(): + def create_memfd_pair(name): + fd = memfd_create(name, 0) + fw = open('/proc/self/fd/{}'.format(fd), 'wb') + fr = open('/proc/self/fd/{}'.format(fd), 'rb') + os.close(fd) + return (fw, fr) + + return [create_memfd_pair("name{}".format(i)) for i in range(10)] + + +def filename(f): + name = os.readlink('/proc/self/fd/{}'.format(f.fileno())) + name = name.replace(' (deleted)', '') + return name + + +def dump_opts(sockf): + return [] diff --git a/test/inhfd/memfd.py.checkskip b/test/inhfd/memfd.py.checkskip new file mode 
100755 index 0000000000..252778969d --- /dev/null +++ b/test/inhfd/memfd.py.checkskip @@ -0,0 +1,7 @@ +#!/usr/bin/env python + +import ctypes +libc = ctypes.CDLL(None) + +# libc may not have memfd_create (e.g., centos on travis) +libc.memfd_create("test".encode('utf8'), 0) diff --git a/test/inhfd/memfd.py.desc b/test/inhfd/memfd.py.desc new file mode 100644 index 0000000000..10666c8232 --- /dev/null +++ b/test/inhfd/memfd.py.desc @@ -0,0 +1 @@ +{ 'flavor': 'h' } diff --git a/test/inhfd/socket.py b/test/inhfd/socket.py index 9cea16ffb9..7efe7faab5 100755 --- a/test/inhfd/socket.py +++ b/test/inhfd/socket.py @@ -1,5 +1,5 @@ -import socket import os +import socket def create_fds(): diff --git a/test/javaTests/README.md b/test/javaTests/README.md new file mode 100644 index 0000000000..4315b9b124 --- /dev/null +++ b/test/javaTests/README.md @@ -0,0 +1,50 @@ +# JavaTests + +The Java functional tests check the Java file-based APIs and memory-mapping APIs by placing the process in various states before checkpointing, and validate that these resources are still accessible after restore. They also validate that the file contents are in the expected state. + +Tests are to be run by a user having the following capabilities: +CAP_DAC_OVERRIDE +CAP_CHOWN +CAP_SETPCAP +CAP_SETGID +CAP_AUDIT_CONTROL +CAP_DAC_READ_SEARCH +CAP_NET_ADMIN +CAP_SYS_ADMIN +CAP_SYS_CHROOT +CAP_SYS_PTRACE +CAP_FOWNER +CAP_KILL +CAP_FSETID +CAP_SYS_RESOURCE +CAP_SETUID + +## File-based Java APIs + +Here we test the File-Based Java APIs by checkpointing the application in the following scenarios and verifying the contents of the file after restore: +- Reading and writing in the same file. (FileRead.java) +- Read from a file and write its content to another file. (ReadWrite.java) +- Reading from multiple files and writing their content to another file. (MultipleFileRead) +- Reading from a file and writing its content to multiple files. 
(MultipleFileWrite) + +## Memory mapping Java APIs + +Here we test the Memory Mapping APIs by checkpointing the application in the following scenario and verifying the contents after restore: +- Memory-mapping a file and writing its content to another file. (MemoryMappings.java) + +## Socket-based Java APIs + +Here we test the Socket-based APIs by checkpointing the application in the following scenarios and verifying the state after restore: +- Checkpointing the server process in the middle of data transfer. (Sockets.java) +- Checkpointing the server process after it has bound to a port but is not listening for client connections. (SocketListen.java) +- Checkpointing the server process while it is listening for client connections, and no client has connected yet. (SocketConnect.java) +- Checkpointing the server process when it has multiple clients in multiple states connected to it. (SocketMultiple.java) +- Checkpointing the client process in the middle of data transfer. (SocketsData.java) + +### Prerequisites for running the tests: +- Maven + +### To run the tests: +- In the javaTests folder run the command ```sudo mvn test``` +- To keep the img files and logs from previous failures, between different runs of the test, use the ```-DneverCleanFailures=true``` option in the Maven command +as ```sudo mvn -DneverCleanFailures=true test``` diff --git a/test/javaTests/pom.xml b/test/javaTests/pom.xml new file mode 100644 index 0000000000..faae44d1bf --- /dev/null +++ b/test/javaTests/pom.xml @@ -0,0 +1,47 @@ + + 4.0.0 + criu + criu-javaTests + 1 + criu-javaTests + + + src + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.14.1 + + + + test.xml + + + + + + maven-compiler-plugin + 3.1 + + 1.7 + 1.7 + + + + + + + + org.testng + testng + 6.3.1 + + + + UTF-8 + + diff --git a/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java new file mode 100644 index 0000000000..9d61e126f1 --- /dev/null +++ 
b/test/javaTests/src/org/criu/java/tests/CheckpointRestore.java @@ -0,0 +1,451 @@ +package org.criu.java.tests; + +import org.testng.Assert; +import org.testng.annotations.AfterTest; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.Parameters; +import org.testng.annotations.Test; + +import java.io.*; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.StandardOpenOption; +import java.text.SimpleDateFormat; +import java.util.Date; + +public class CheckpointRestore { + private MappedByteBuffer mappedByteBuffer = null; + private String testName = ""; + private String logFolder = Helper.LOG_FOLDER + "/"; + private String outputFolder = Helper.OUTPUT_FOLDER_NAME + "/"; + + /** + * Create CRlog and output directory if they don't exist. + * Delete directories containing .img files from failed Checkpoint-Restore if 'neverCleanFailures' property is not set to true. + * + * @throws IOException + */ + @BeforeSuite + void suiteSetup() throws IOException { + System.out.println("Tests are to be run as a privileged user having capabilities mentioned in ReadMe"); + boolean neverCleanFailures = Boolean.getBoolean("neverCleanFailures"); + Path logDir = Paths.get(logFolder); + Path outputDir = Paths.get(outputFolder); + if (!Files.exists(logDir)) { + System.out.println("Logs directory does not exist, creating it"); + Files.createDirectory(logDir); + } + if (!Files.exists(outputDir)) { + System.out.println("Output directory does not exist, creating it"); + Files.createDirectory(outputDir); + } + /* + * Delete the directories containing the img files from failed Checkpoint-Restore. 
+ */ + if (!neverCleanFailures) { + File output = new File(outputFolder); + String[] name = output.list(); + for (int i = 0; null != name && i < name.length; i++) { + File testFolder = new File(outputFolder + name[i]); + if (testFolder.isDirectory()) { + String[] list = testFolder.list(); + File file; + if (null != list) { + for (int j = 0; j < list.length; j++) { + file = new File(outputFolder + name[i] + "/" + list[j]); + if (!file.isDirectory()) { + Files.delete(file.toPath()); + } + } + } + } + Files.delete(testFolder.toPath()); + } + } + } + + /** + * Create the output folder for the test in case it does not exist + * + * @param testName Name of the java test + * @throws IOException + */ + private void testSetup(String testName) throws IOException { + Path testFolderPath = Paths.get(outputFolder + testName + "/"); + if (!Files.exists(testFolderPath)) { + System.out.println("Creating the test folder"); + Files.createDirectory(testFolderPath); + } + } + + /** + * Read the pid of process from the pid file of test + * + * @param name Name of the java test + * @return pid Process id of the java test process + * @throws IOException + */ + private String getPid(String name) throws IOException { + name = outputFolder + testName + "/" + name + Helper.PID_APPEND; + File pidfile = new File(name); + BufferedReader pidReader = new BufferedReader(new FileReader(pidfile)); + String pid = pidReader.readLine(); + pidReader.close(); + return pid; + } + + /** + * @param testName Name of the java test + * @param checkpointOpt Additional options for checkpoint + * @param restoreOpt Additional options for restore + * @throws Exception + */ + @Test + @Parameters({"testname", "checkpointOpt", "restoreOpt"}) + public void runtest(String testName, String checkpointOpt, String restoreOpt) throws Exception { + this.testName = testName; + String name = Helper.PACKAGE_NAME + "." 
+ testName; + String pid; + int exitCode; + + System.out.println("======= Testing " + testName + " ========"); + + testSetup(testName); + + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + if (f.exists()) { + f.delete(); + } + + /* + * Create a new file that will be mapped to memory and used to communicate between + * this process and the java test process. + */ + boolean newFile = f.createNewFile(); + Assert.assertTrue(newFile, "Unable to create a new file to be mapped"); + + /* + * MappedByteBuffer communicates between this process and java process called. + */ + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + mappedByteBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + mappedByteBuffer.clear(); + channel.close(); + + /* + * Put MappedByteBuffer in Init state + */ + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + + /* + * Run the test as a separate process + */ + System.out.println("Starting the java Test"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", name); + Process process = builder.start(); + + char currentState = mappedByteBuffer.getChar(Helper.MAPPED_INDEX); + /* + * Loop until the test process changes the state of MappedByteBuffer from init state + */ + while (Helper.STATE_INIT == currentState) { + currentState = mappedByteBuffer.getChar(Helper.MAPPED_INDEX); + Thread.sleep(100); + } + + /* + * If Mapped Buffer is in Helper.STATE_FAIL state before checkpointing then an exception must + * have occurred in the test. + */ + while (Helper.STATE_FAIL == currentState) { + try { + /* + * We exit the test process with exit code 5 in case of an exception + */ + exitCode = process.exitValue(); + /* + * Reaching here implies that .exitValue() has not thrown an exception, so the process has + * exited, We now check the exitCode. 
+ */ + if (5 == exitCode) { + Assert.fail(testName + ": Exception occurred while running the test: check the log file for details."); + } else { + Assert.fail(testName + ": ERROR: Unexpected value of exit code: " + exitCode + ", expected: 5"); + } + } catch (IllegalThreadStateException e) { + /* + * Do nothing, as an Exception is expected if the process has not exited + * and we try to get its exitValue. + */ + } + + currentState = mappedByteBuffer.getChar(Helper.MAPPED_INDEX); + } + + /* + * Mapped Buffer state should be Helper.STATE_CHECKPOINT for checkpointing or Helper.STATE_END if some error occurs in test + */ + if (Helper.STATE_END != currentState) { + Assert.assertEquals(currentState, Helper.STATE_CHECKPOINT, testName + ": ERROR: Error occurred while running the test: test is not in the excepted 'waiting to be checkpointed state': " + currentState); + } else { + Assert.fail(testName + ": ERROR: Error took place in the test check the log file for more details"); + } + /* + * Reaching here implies that MappedByteBuffer is in To Be Checkpointed state. + * Get the pid of the test process + */ + + pid = getPid(testName); + try { + /* + * Checkpoint the process + */ + checkpoint(pid, checkpointOpt); + + } catch (Exception e) { + /* + * If exception occurs put the MappedByteBuffer to Helper.STATE_TERMINATE-Terminate state. + * On reading the terminate state, the test process terminates, else it + * may go on looping. + */ + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_TERMINATE); + Assert.fail(testName + ": Exception occurred while during checkpointing" + e, e); + } + + /* + * The process has been checkpointed successfully, now restoring the process. 
+ */ + try { + /* + * Restore the process + */ + restore(restoreOpt); + } catch (Exception e) { + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_TERMINATE); + Assert.fail(testName + ": Exception occurred while restoring the test" + e, e); + } + + /* + * Wait for test process to finish + */ + currentState = mappedByteBuffer.getChar(Helper.MAPPED_INDEX); + while (Helper.STATE_RESTORE == currentState) { + currentState = mappedByteBuffer.getChar(Helper.MAPPED_INDEX); + } + + /* + * If a test passes it puts the MappedByteBuffer to Helper.STATE_PASS-Pass state, + * On failing to Helper.STATE_FAIL-Fail state, and if our Buffer is in Helper.STATE_TERMINATE state + * its because the checkpoint-restore of test process failed. + */ + + Assert.assertNotEquals(currentState, Helper.STATE_TERMINATE, testName + ": ERROR: Checkpoint-Restore failed"); + Assert.assertNotEquals(currentState, Helper.STATE_FAIL, testName + ": ERROR: Test Failed, Check Log for details"); + Assert.assertEquals(currentState, Helper.STATE_PASS, testName + " ERROR: Unexpected State of Mapped Buffer"); + System.out.println("----- " + "PASS" + " -----"); + + } + + /** + * Remove .img files, dump.log, restore.log, stats-dump and stats-restore files from Log Directory + * + * @throws IOException + */ + @AfterTest + void cleanup() throws IOException { + int i; + String currentPath = System.getProperty("user.dir"); + currentPath = currentPath + "/" + logFolder; + File deleteFile; + File dir = new File(currentPath); + String[] imgFiles = dir.list(new ImgFilter()); + if (null != imgFiles) { + for (i = 0; i < imgFiles.length; i++) { + deleteFile = new File(currentPath + imgFiles[i]); + Files.delete(deleteFile.toPath()); + } + } + + boolean exists = Files.exists(Paths.get(currentPath + "dump.log")); + if (exists) { + Files.delete(Paths.get(currentPath + "dump.log")); + } + + exists = Files.exists(Paths.get(currentPath + "restore.log")); + if (exists) { + Files.delete(Paths.get(currentPath + 
"restore.log")); + } + + exists = Files.exists(Paths.get(currentPath + "stats-dump")); + if (exists) { + Files.delete(Paths.get(currentPath + "stats-dump")); + } + + exists = Files.exists(Paths.get(currentPath + "stats-restore")); + if (exists) { + Files.delete(Paths.get(currentPath + "stats-restore")); + } + } + + /** + * Copy .img files, dump.log, restore.log, stats-dump and stats-restore files from Log Directory if they exist + * to another folder. + * + * @throws IOException + */ + String copyFiles() throws IOException { + String currentPath = System.getProperty("user.dir"); + String folderSuffix = new SimpleDateFormat("yyMMddHHmmss").format(new Date()); + String fromPath = currentPath + "/" + logFolder; + File fromDir = new File(fromPath); + Path fromFile, toFile; + boolean exists; + String toPath = currentPath + "/" + outputFolder + testName + folderSuffix + "/"; + Path dirPath = Paths.get(toPath); + Files.createDirectory(dirPath); + + String[] imgFiles = fromDir.list(new ImgFilter()); + if (null != imgFiles) { + for (int i = 0; i < imgFiles.length; i++) { + fromFile = Paths.get(fromPath + imgFiles[i]); + toFile = Paths.get(toPath + imgFiles[i]); + Files.copy(fromFile, toFile); + } + } + + fromFile = Paths.get(fromPath + "dump.log"); + exists = Files.exists(fromFile); + if (exists) { + toFile = Paths.get(toPath + "dump.log"); + Files.copy(fromFile, toFile); + } + + fromFile = Paths.get(fromPath + "restore.log"); + exists = Files.exists(fromFile); + if (exists) { + toFile = Paths.get(toPath + "restore.log"); + Files.copy(fromFile, toFile); + } + + fromFile = Paths.get(fromPath + "stats-dump"); + exists = Files.exists(fromFile); + if (exists) { + toFile = Paths.get(toPath + "stats-dump"); + Files.copy(fromFile, toFile); + } + + fromFile = Paths.get(fromPath + "stats-restore"); + exists = Files.exists(fromFile); + if (exists) { + toFile = Paths.get(toPath + "stats-restore"); + Files.copy(fromFile, toFile); + } + + return folderSuffix; + } + + /** + * Checkpoint 
the process, if process has not been checkpointed correctly + * copy the .img, log and stats files, puts MappedBuffer to 'terminate' state and mark + * test as failed + * + * @param pid Pid of process to be checkpointed + * @param checkpointOpt Additional options for checkpoint + * @throws IOException + * @throws InterruptedException + */ + private void checkpoint(String pid, String checkpointOpt) throws IOException, InterruptedException { + ProcessBuilder builder; + System.out.println("Checkpointing process " + pid); + String command = "../../criu/criu dump --shell-job -t " + pid + " -vvv -D " + logFolder + " -o dump.log"; + if (0 == checkpointOpt.length()) { + String[] cmd = command.split(" "); + builder = new ProcessBuilder(cmd); + } else { + command = command + " " + checkpointOpt; + String[] cmd = command.split(" "); + builder = new ProcessBuilder(cmd); + } + Process process = builder.start(); + BufferedReader stdError = new BufferedReader(new InputStreamReader(process.getErrorStream())); + int exitCode = process.waitFor(); + + if (0 != exitCode) { + /* + * Print the error stream + */ + String line = stdError.readLine(); + while (null != line) { + System.out.println(line); + line = stdError.readLine(); + } + + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_TERMINATE); + /* + * If checkpoint fails copy the img files, dump.log, stats-dump, stats-restore + */ + String folderSuffix = copyFiles(); + + Assert.fail(testName + ": ERROR: Error during checkpoint: exitCode of checkpoint process was not zero.\nFor more details check dump.log in " + outputFolder + testName + folderSuffix); + return; + } + + System.out.println("Checkpoint success"); + process.destroy(); + + } + + /** + * Restore the process, if process has been restored correctly put Mapped Buffer to + * 'restored' state, else copy the .img, log and stats files and put MappedBuffer to 'terminate' + * state and mark test as failed + * + * @param restoreOpt Additional options for restore + * 
@throws IOException + * @throws InterruptedException + */ + private void restore(String restoreOpt) throws IOException, InterruptedException { + ProcessBuilder builder; + System.out.println("Restoring process"); + String command = "../../criu/criu restore -d -vvv --shell-job -D " + logFolder + " -o restore.log"; + if (0 == restoreOpt.length()) { + String[] cmd = command.split(" "); + builder = new ProcessBuilder(cmd); + } else { + command = command + " " + restoreOpt; + String[] cmd = command.split(" "); + builder = new ProcessBuilder(cmd); + } + + Process process = builder.start(); + BufferedReader stdError = new BufferedReader(new InputStreamReader(process.getErrorStream())); + int exitCode = process.waitFor(); + + if (0 != exitCode) { + /* + * Print the error stream + */ + String line = stdError.readLine(); + while (null != line) { + System.out.println(line); + line = stdError.readLine(); + } + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_TERMINATE); + /* + * If restore fails copy img files, dump.log, restore.log, stats-dump, stats-restore + */ + String folderSuffix = copyFiles(); + Assert.fail(testName + ": ERROR: Error during restore: exitCode of restore process was not zero.\nFor more details check restore.log in " + outputFolder + testName + folderSuffix); + + return; + } else { + System.out.println("Restore success"); + mappedByteBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + } + process.destroy(); + } +} diff --git a/test/javaTests/src/org/criu/java/tests/FileRead.java b/test/javaTests/src/org/criu/java/tests/FileRead.java new file mode 100644 index 0000000000..d8851a73ed --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/FileRead.java @@ -0,0 +1,175 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import 
java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class FileRead { + private static String TESTNAME = "FileRead"; + + /** + * @param i int value denoting the line number. + * @return The line as a string. + */ + private static String getLine(int i) { + return "Line No: " + i + "\n"; + } + + /** + * Write in a file, line by line, and read it, checkpoint and restore + * and then continue to read and write the file. + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null; + Logger logger = null; + int wi, ri = 0; + try { + File file = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/FileRead_write.txt"); + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + int val = Helper.init(TESTNAME, pid, logger); + if (0 != val) { + logger.log(Level.SEVERE, "Helper.init returned a non-zero code."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + /* + * Mapped Byte Buffer should be in init state at the beginning of test + */ + if (Helper.STATE_INIT != b.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + logger.log(Level.INFO, "Checking existence of file to be read and written to."); + if (file.exists()) { + file.delete(); + } + boolean newFile = file.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Cannot create a new file 
to read and write to."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + BufferedWriter brw = new BufferedWriter(new FileWriter(file)); + BufferedReader brr = new BufferedReader(new FileReader(file)); + + logger.log(Level.INFO, "Start writing the lines in file"); + + for (wi = 1; wi <= 5; wi++) { + brw.write(getLine(wi)); + } + + brw.flush(); + String s = "Line No: 0"; + int i; + + for (i = 0; i < 50; i++) { + brw.write(getLine(wi)); + brw.flush(); + wi++; + s = brr.readLine(); + ri = Integer.parseInt(s.replaceAll("[\\D]", "")); + } + + wi--; + logger.log(Level.INFO, "Going to checkpoint"); + + /* + * Checkpoint and wait for restore + */ + Helper.checkpointAndWait(b, logger); + logger.log(Level.INFO, "Test has been restored!"); + + brw.flush(); + + try { + s = brr.readLine(); + + } catch (Exception e) { + logger.log(Level.SEVERE, "Error: Buffered Reader is not reading file"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (null == s || s.isEmpty()) { + logger.log(Level.SEVERE, "Error: Error while reading lines after restore: Line read is null"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + int readLineNo = Integer.parseInt(s.replaceAll("[\\D]", "")); + if (ri + 1 != readLineNo) { + logger.log(Level.SEVERE, "Error: Not reading at correct line"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + String ch = brr.readLine(); + while (null != ch && !ch.isEmpty()) { + s = ch; + ch = brr.readLine(); + } + + readLineNo = Integer.parseInt(s.replaceAll("[\\D]", "")); + + if (readLineNo != wi) { + logger.log(Level.SEVERE, "Error: Data written has been lost"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + try { + brw.write(getLine(wi + 1)); + brw.flush(); + } catch (IOException e) { + logger.log(Level.SEVERE, "Error: cannot write file after restore"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + 
} + + s = brr.readLine(); + readLineNo = Integer.parseInt(s.replaceAll("[\\D]", "")); + + if (readLineNo != wi + 1) { + logger.log(Level.SEVERE, "Error: Data not written correctly"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "File is being read and written to correctly after restore!"); + logger.log(Level.INFO, Helper.PASS_MESSAGE); + brw.close(); + brr.close(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + System.exit(0); + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.FINE, writer.toString()); + } + + if (null != b) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/Helper.java b/test/javaTests/src/org/criu/java/tests/Helper.java new file mode 100644 index 0000000000..9a1b333286 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/Helper.java @@ -0,0 +1,130 @@ +package org.criu.java.tests; + +import java.io.*; +import java.nio.MappedByteBuffer; +import java.util.logging.FileHandler; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.logging.SimpleFormatter; + +class Helper { + static String MEMORY_MAPPED_FILE_NAME = "output/file"; + static String PASS_MESSAGE = "Test was a Success!!!"; + static String OUTPUT_FOLDER_NAME = "output"; + static String PACKAGE_NAME = "org.criu.java.tests"; + static String PID_APPEND = ".pid"; + static String SOURCE_FOLDER = "src/org/criu/java/tests"; + static String LOG_FOLDER = "CRlogs"; + static int MAPPED_REGION_SIZE = 100; + static int MAPPED_INDEX = 1; + static char STATE_RESTORE = 'R'; + static char STATE_CHECKPOINT = 'C'; + static char STATE_INIT = 'I'; + static char STATE_TERMINATE = 'T'; + static char STATE_END = 'E'; + static 
char STATE_FAIL = 'F'; + static char STATE_PASS = 'P'; + + /** + * Create a new log file and pidfile and write + * the pid to the pidFile. + * + * @param testName Name of the java test + * @param pid Pid of the java test process + * @param logger + * @return 0 or 1 denoting whether the function was successful or not. + * @throws IOException + */ + static int init(String testName, String pid, Logger logger) throws IOException { + File pidfile = new File(OUTPUT_FOLDER_NAME + "/" + testName + "/" + testName + PID_APPEND); + + FileHandler handler = new FileHandler(Helper.OUTPUT_FOLDER_NAME + "/" + testName + "/" + testName + ".log", false); + handler.setFormatter(new SimpleFormatter()); + handler.setLevel(Level.FINE); + logger.addHandler(handler); + logger.setLevel(Level.FINE); + + /* + * Create a pid file and write the process's pid into it. + */ + if (pidfile.exists()) { + pidfile.delete(); + } + boolean newFile = pidfile.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Cannot create new pid file."); + return 1; + } + BufferedWriter pidWriter = new BufferedWriter(new FileWriter(pidfile)); + pidWriter.write(pid + "\n"); + pidWriter.close(); + return 0; + } + + /** + * Put the Mapped Buffer to 'Ready to be checkpointed' state and wait for restore. + * + * @param b The MappedByteBuffer from the calling process. + * @param logger The Logger from the calling process. + */ + static void checkpointAndWait(MappedByteBuffer b, Logger logger) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + char c = b.getChar(Helper.MAPPED_INDEX); + /* + * Loop while MappedByteBuffer is in 'To be checkpointed' state + */ + while (Helper.STATE_CHECKPOINT == c) { + c = b.getChar(Helper.MAPPED_INDEX); + } + /* + * Test is in 'T' state if some error or exception occurs during checkpoint or restore. 
+ */ + if (Helper.STATE_TERMINATE == c) { + logger.log(Level.SEVERE, "Error during checkpoint-restore, Test terminated"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + /* + * The expected state of MappedByteBuffer is Helper.STATE_RESTORE-restored state. + */ + if (Helper.STATE_RESTORE != c) { + logger.log(Level.INFO, "Error: Test state is not the expected Restored state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + + + /** + * Compare two files and return true if their content is similar. + * + * @param readFile File 1 whose content has to be compared. + * @param writeFile File 2 whose content has to be compared. + * @return true if the files are similar, false otherwise. + * @throws IOException + */ + static boolean compare(File readFile, File writeFile) throws IOException { + BufferedReader bir = new BufferedReader(new FileReader(readFile)); + BufferedReader bor = new BufferedReader(new FileReader(writeFile)); + String si, so; + si = bir.readLine(); + so = bor.readLine(); + while (null != si && null != so) { + if (!si.equals(so)) { + return false; + } + + si = bir.readLine(); + so = bor.readLine(); + } + + if ((null == si) && (null == so)) { + return true; + } + bir.close(); + bor.close(); + + return false; + } + +} diff --git a/test/javaTests/src/org/criu/java/tests/ImgFilter.java b/test/javaTests/src/org/criu/java/tests/ImgFilter.java new file mode 100644 index 0000000000..97087c2ccd --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/ImgFilter.java @@ -0,0 +1,11 @@ +package org.criu.java.tests; + +import java.io.File; +import java.io.FilenameFilter; + +class ImgFilter implements FilenameFilter { + @Override + public boolean accept(File dir, String fileName) { + return (fileName.endsWith(".img")); + } +} diff --git a/test/javaTests/src/org/criu/java/tests/MemoryMappings.java b/test/javaTests/src/org/criu/java/tests/MemoryMappings.java new file mode 100644 index 0000000000..4ac6f4a17f 
--- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/MemoryMappings.java @@ -0,0 +1,121 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class MemoryMappings { + private static String TESTNAME = "MemoryMappings"; + + /** + * Map a file to memory and write the mapped data into a file, + * checkpointing and restoring in between. + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null; + Logger logger = null; + + try { + MappedByteBuffer testBuffer; + char ch; + int i = 1; + boolean similar; + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + File readFile = new File(Helper.SOURCE_FOLDER + "/" + "ReadWrite.java"); + File writeFile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/" + "MemoryMappings_file.txt"); + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + int val = Helper.init(TESTNAME, pid, logger); + if (0 != val) { + logger.log(Level.SEVERE, "Helper.init returned a non-zero code."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + /* + * Mapped Byte Buffer should be in init state at the beginning of test + */ + if (Helper.STATE_INIT != b.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, 
test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Checking existence of file to be memory mapped"); + if (!readFile.exists()) { + logger.log(Level.SEVERE, "Error: File from which to read does not exist"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + channel = FileChannel.open(readFile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + testBuffer = channel.map(MapMode.READ_WRITE, 0, readFile.length()); + channel.close(); + + if (writeFile.exists()) { + writeFile.delete(); + } + boolean newFile = writeFile.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Error: Cannot create a new file to write to."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + BufferedWriter brw = new BufferedWriter(new FileWriter(writeFile)); + + while (testBuffer.hasRemaining()) { + ch = (char) testBuffer.get(); + brw.write(ch); + i++; + if (200 == i) { + logger.log(Level.INFO, "Going to checkpoint"); + Helper.checkpointAndWait(b, logger); + logger.log(Level.INFO, "Test has been restored!"); + } + } + + brw.close(); + logger.log(Level.INFO, "Comparing contents of the file"); + + similar = Helper.compare(readFile, writeFile); + if (!similar) { + logger.log(Level.SEVERE, "Error: Files are not similar after writing"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Data was read and written correctly!"); + logger.log(Level.INFO, Helper.PASS_MESSAGE); + brw.close(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.FINE, writer.toString()); + } + + if (null != b) { + 
b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/MultipleFileRead.java b/test/javaTests/src/org/criu/java/tests/MultipleFileRead.java new file mode 100644 index 0000000000..7b023673e0 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/MultipleFileRead.java @@ -0,0 +1,203 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class MultipleFileRead { + private static String TESTNAME = "MultipleFileRead"; + + /** + * @param readFile1 File 1 whose contents are read. + * @param readFile2 File 2 whose contents are read. + * @param writeFile File in which data has been written to. + * @return true if the data written is as expected, false otherwise. 
+ * @throws IOException + */ + private static boolean compare(File readFile1, File readFile2, File writeFile) throws IOException { + BufferedReader br1 = new BufferedReader(new FileReader(readFile1)); + BufferedReader br2 = new BufferedReader(new FileReader(readFile2)); + BufferedReader brw = new BufferedReader(new FileReader(writeFile)); + boolean eof1, eof2; + eof1 = false; + eof2 = false; + String inpString, wrtString; + + while (!eof1 || !eof2) { + if (!eof1) { + inpString = br1.readLine(); + if (null == inpString) { + eof1 = true; + } else { + wrtString = brw.readLine(); + if (null == wrtString) { + return false; + } + if (!wrtString.equals(inpString)) { + return false; + } + } + } + if (!eof2) { + inpString = br2.readLine(); + if (null == inpString) { + eof2 = true; + } else { + wrtString = brw.readLine(); + if (null == wrtString) { + return false; + } + if (!wrtString.equals(inpString)) { + return false; + } + } + } + } + + wrtString = brw.readLine(); + if (null != wrtString) { + return false; + } + + br1.close(); + br2.close(); + brw.close(); + + return true; + } + + /** + * Read from multiple files and write their content into another file, + * checkpointing and restoring in between. + * + * @param args Not used. + */ + public static void main(String[] args) { + MappedByteBuffer b = null; + String s; + int i = 0; + Logger logger = null; + try { + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + File readFile1 = new File(Helper.SOURCE_FOLDER + "/" + "FileRead.java"); + File readFile2 = new File(Helper.SOURCE_FOLDER + "/" + "ReadWrite.java"); + File writeFile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/" + "MultipleFileRead_file.txt"); + boolean eofFile1 = false, eofFile2 = false, check; + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + int val = Helper.init(TESTNAME, pid, logger); + if (0 != val) { + logger.log(Level.SEVERE, "Helper.init returned a non-zero code."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + /* + * Mapped Byte Buffer should be in init state at the beginning of test + */ + if (b.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Checking existence of the read files"); + + if (!readFile1.exists()) { + logger.log(Level.SEVERE, "Error: File from which to read does not exist"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (!readFile2.exists()) { + logger.log(Level.SEVERE, "Error: File from which to read does not exist"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (writeFile.exists()) { + writeFile.delete(); + } + logger.log(Level.INFO, "Creating writeFile"); + boolean newFile = writeFile.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Error: Cannot create a new file to write to."); + 
b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + BufferedReader br1 = new BufferedReader(new FileReader(readFile1)); + BufferedReader br2 = new BufferedReader(new FileReader(readFile2)); + BufferedWriter brw = new BufferedWriter(new FileWriter(writeFile)); + + logger.log(Level.INFO, "Writing in file"); + + while (!eofFile1 || !eofFile2) { + if (!eofFile1) { + s = br1.readLine(); + i++; + if (null == s) { + eofFile1 = true; + } else { + brw.write(s + "\n"); + } + } + if (!eofFile2) { + s = br2.readLine(); + i++; + if (null == s) { + eofFile2 = true; + } else { + brw.write(s + "\n"); + } + } + if (10 == i) { + /* + * Checkpoint and Restore + */ + logger.log(Level.INFO, "Going to checkpoint"); + Helper.checkpointAndWait(b, logger); + logger.log(Level.INFO, "Test has been restored!"); + } + } + brw.flush(); + logger.log(Level.INFO, "Checking the content of the file"); + check = compare(readFile1, readFile2, writeFile); + + if (!check) { + logger.log(Level.SEVERE, "Error: Files are not similar after writing"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "The file has been written as expected"); + logger.log(Level.INFO, Helper.PASS_MESSAGE); + br1.close(); + br2.close(); + brw.close(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.FINE, writer.toString()); + } + + if (null != b) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/MultipleFileWrite.java b/test/javaTests/src/org/criu/java/tests/MultipleFileWrite.java new file mode 100644 index 0000000000..76d287a07c --- /dev/null +++ 
b/test/javaTests/src/org/criu/java/tests/MultipleFileWrite.java @@ -0,0 +1,140 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class MultipleFileWrite { + private static String TESTNAME = "MultipleFileWrite"; + + /** + * Reads from a file and write its content into multiple files, + * checkpointing and restoring in between. + * + * @param args Not used. + */ + public static void main(String[] args) { + MappedByteBuffer b = null; + String s, pid; + int i = 1; + Logger logger = null; + boolean similar1, similar2; + try { + File readFile = new File(Helper.SOURCE_FOLDER + "/" + "FileRead.java"); + File writeFile1 = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/" + TESTNAME + "1_file.txt"); + File writeFile2 = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/" + TESTNAME + "2_file.txt"); + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + int val = Helper.init(TESTNAME, pid, logger); + if (0 != val) { + logger.log(Level.SEVERE, "Helper.init returned a non-zero code."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + /* + * Mapped Byte Buffer should be in init state at the beginning of test + */ + if (Helper.STATE_INIT != b.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Checking existence of read files!"); + + if (!readFile.exists()) { + logger.log(Level.SEVERE, "Error: File from which to read does not exist"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (writeFile1.exists()) { + writeFile1.delete(); + } + boolean newFile = writeFile1.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Error: Cannot create a new file to write to."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + if (writeFile2.exists()) { + writeFile2.delete(); + } + newFile = writeFile2.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Error: Cannot create a new file to write to."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Created write files"); + + BufferedReader br = new BufferedReader(new FileReader(readFile)); + BufferedWriter bw1 = new BufferedWriter(new FileWriter(writeFile1)); + BufferedWriter bw2 = new BufferedWriter(new 
FileWriter(writeFile2)); + + s = br.readLine(); + + while (null != s) { + bw1.write(s + "\n"); + bw2.write(s + "\n"); + if (90 == i) { + /* + * Checkpoint and Restore + */ + logger.log(Level.INFO, "Going to checkpoint"); + Helper.checkpointAndWait(b, logger); + logger.log(Level.INFO, "Test has been restored!"); + } + + i++; + s = br.readLine(); + } + + bw1.flush(); + bw2.flush(); + logger.log(Level.INFO, "Checking files have been written correctly"); + + similar1 = Helper.compare(readFile, writeFile1); + similar2 = Helper.compare(readFile, writeFile2); + + if (!similar1 || !similar2) { + logger.log(Level.SEVERE, "Error: Written data is not identical to the data read"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Content of files is as expected"); + logger.log(Level.INFO, Helper.PASS_MESSAGE); + br.close(); + bw1.close(); + bw2.close(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.FINE, writer.toString()); + } + + if (null != b) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/ReadWrite.java b/test/javaTests/src/org/criu/java/tests/ReadWrite.java new file mode 100644 index 0000000000..fa98447ed7 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/ReadWrite.java @@ -0,0 +1,119 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import 
java.util.logging.Logger; + +class ReadWrite { + private static String TESTNAME = "ReadWrite"; + + /** + * Copies FileRead.java line by line into ReadWrite_file.txt, requesting a + * checkpoint/restore after the 50th line, and then compares both files. + * + * @param args Not used. + */ + public static void main(String[] args) { + int i = 0; + String s, pid; + boolean similar; + MappedByteBuffer b = null; + Logger logger = null; + try { + File readFile = new File(Helper.SOURCE_FOLDER + "/" + "FileRead.java"); + File writeFile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/" + "ReadWrite_file.txt"); + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); /* map 'b' before Helper.init so that a failed init can be reported through it */ + FileChannel channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + int val = Helper.init(TESTNAME, pid, logger); + if (0 != val) { + logger.log(Level.SEVERE, "Helper.init returned a non-zero code."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + /* + * Mapped Byte Buffer should be in init state at the beginning of test + */ + if (Helper.STATE_INIT != b.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Checking existence of files to be read!"); + if (!readFile.exists()) { + logger.log(Level.SEVERE, "Error: File from which to read does not exist"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + if (writeFile.exists()) { + writeFile.delete(); + } + logger.log(Level.INFO, "Creating the writeFile"); + boolean newFile =
writeFile.createNewFile(); + if (!newFile) { + logger.log(Level.SEVERE, "Error: Cannot create a new file to write to."); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + BufferedReader brr = new BufferedReader(new FileReader(readFile)); + BufferedWriter brw = new BufferedWriter(new FileWriter(writeFile)); + logger.log(Level.INFO, "Start writing"); + + s = brr.readLine(); + + while (null != s) { + i++; + brw.write(s + "\n"); + + if (50 == i) { + /* + * Checkpoint and Restore once, right after the 50th line has been written + */ + logger.log(Level.INFO, "Going to checkpoint"); + Helper.checkpointAndWait(b, logger); + logger.log(Level.INFO, "Test has been restored!"); + } + s = brr.readLine(); + } + + brw.flush(); + logger.log(Level.INFO, "Checking content of the files."); + similar = Helper.compare(readFile, writeFile); + + if (!similar) { + logger.log(Level.SEVERE, "Error: Files are not similar after writing"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Content of file is as expected"); + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + System.exit(0); + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (null != b) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketHelper.java b/test/javaTests/src/org/criu/java/tests/SocketHelper.java new file mode 100644 index 0000000000..684125019d --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketHelper.java @@ -0,0 +1,100 @@ +package org.criu.java.tests; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.nio.MappedByteBuffer;
+import java.util.logging.FileHandler; +import java.util.logging.Level; +import java.util.logging.Logger; +import java.util.logging.SimpleFormatter; + +class SocketHelper { + + static char STATE_LISTEN = 'S'; /* set by the server just before it starts listening for connections */ + static char STATE_SUCCESS = 'Z'; /* set by the server when it has finished successfully */ + static String IP_ADDRESS = "127.0.0.1"; + + /** + * Creates a new log file, for the logger to log in. + * + * @param testName Name of the server or client program + * @param parentTestName Name of the test + * @param logger Logger that gets a FINE-level file handler for the new log file + * @throws IOException If the log file cannot be opened + */ + static void init(String testName, String parentTestName, Logger logger) throws IOException { + FileHandler handler = new FileHandler(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/" + testName + ".log", false); + handler.setFormatter(new SimpleFormatter()); + handler.setLevel(Level.FINE); + logger.addHandler(handler); + logger.setLevel(Level.FINE); + } + + /** + * Writes pid of the process to be checkpointed in the file + * + * @param parentTestName Name of the test + * @param pid Pid of the process to be checkpointed + * @throws IOException If the pid file cannot be opened or written + */ + static void writePid(String parentTestName, String pid) throws IOException { + File pidfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/" + parentTestName + Helper.PID_APPEND); + try (BufferedWriter pidwriter = new BufferedWriter(new FileWriter(pidfile))) { + /* + * Overwriting pid to be checkpointed; try-with-resources closes the writer even if the write fails + */ + pidwriter.write(pid + "\n"); + } + } + + /** + * Waits for the MappedByteBuffer to change state from STATE_CHECKPOINT to STATE_RESTORE + * + * @param socketMappedBuffer MappedByteBuffer between the client, server and the controller process.
+ * @param logger Logger used to report the state found after the wait + */ + static void socketWaitForRestore(MappedByteBuffer socketMappedBuffer, Logger logger) { + while (Helper.STATE_CHECKPOINT == socketMappedBuffer.getChar(Helper.MAPPED_INDEX)) { + ; + } + if (Helper.STATE_RESTORE != socketMappedBuffer.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Server socket was not in expected restore state " + socketMappedBuffer.getChar(Helper.MAPPED_INDEX)); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } else { + logger.log(Level.INFO, "Restored!!!"); + } + } + + /** + * Puts the MappedByteBuffer to Helper.STATE_CHECKPOINT and waits for CheckpointRestore.java to change its state to Helper.STATE_RESTORE + * + * @param b MappedByteBuffer between the controller process and CheckpointRestore.java + * @param logger Logger to log the messages + * @param p1 First child process to destroy on failure (Sockets and SocketsConnect pass the server process) + * @param p2 Second child process to destroy on failure (Sockets and SocketsConnect pass the client process) + */ + static void checkpointAndWait(MappedByteBuffer b, Logger logger, Process p1, Process p2) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + char c = b.getChar(Helper.MAPPED_INDEX); + while (Helper.STATE_CHECKPOINT == c) { + c = b.getChar(Helper.MAPPED_INDEX); + } + if (Helper.STATE_TERMINATE == c) { + logger.log(Level.SEVERE, "Error during checkpoint-restore, Test terminated"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + p1.destroy(); + p2.destroy(); + System.exit(1); + } + if (Helper.STATE_RESTORE != c) { + logger.log(Level.SEVERE, "Error: Test state is not the expected Restored state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + p1.destroy(); + p2.destroy(); + System.exit(1); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/Sockets.java b/test/javaTests/src/org/criu/java/tests/Sockets.java new file mode 100644 index 0000000000..94cc217c4a --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/Sockets.java @@ -0,0 +1,141 @@ +package org.criu.java.tests; + +import
java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class Sockets { + static String TESTNAME = "Sockets"; + + /** + * Runs the client and server process, checkpoints the server process while it is in the middle of data transfer + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null, socketMappedBuffer = null; + FileChannel channel; + String pid; + String port = "49200"; + Logger logger = null; + try { + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + + /* + * Mapped buffer 'b' to communicate between CheckpointRestore.java and this process. + */ + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + Helper.init(TESTNAME, pid, logger); /* NOTE(review): the return value is ignored here, while ReadWrite checks it for non-zero - confirm */ + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + if (Helper.STATE_INIT != b.getChar(Helper.MAPPED_INDEX)) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Creating socketBufferFile and setting the init value of buffer"); + + /* + * Socket Mapped Buffer to communicate between server process, client process and this process.
+ */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/SocketsFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + + logger.log(Level.INFO, "Starting server and client process"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsServer", TESTNAME, port); + Process serverProcess = builder.start(); + logger.log(Level.INFO, "Server process started"); + builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsClient", TESTNAME, port); + Process clientProcess = builder.start(); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; /* busy-wait until the socket buffer leaves the INIT and LISTEN states */ + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Killing the server and client process"); + logger.log(Level.SEVERE, "Error took place in the client or server process; check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT) { + logger.log(Level.SEVERE, "Killing the server and client process"); + logger.log(Level.SEVERE, "State is not the expected 'to be checkpointed' state"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + logger.log(Level.INFO, "Going to checkpoint server process"); + SocketHelper.checkpointAndWait(b, logger, serverProcess,
clientProcess); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + logger.log(Level.INFO, "Process has been restored"); + } + /* + * Loop while test is running, i.e. while the socket buffer is in the RESTORE or SUCCESS state. + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_FAIL && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_PASS) { + logger.log(Level.SEVERE, "Killing the server and client process"); + logger.log(Level.SEVERE, "Received wrong message from the child process: not the expected finish message"); + logger.log(Level.SEVERE, "Check their log files for more details"); + clientProcess.destroy(); /* NOTE(review): serverProcess is not destroyed on this path although the log says so; confirm this is intended after a restore */ + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Killing the server and client process"); + logger.log(Level.SEVERE, "Error in the client or server process: check their log for details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + /* + * Client process puts socketMappedBuffer to Pass state if the test passed.
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_PASS) { + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (b != null) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsClient.java b/test/javaTests/src/org/criu/java/tests/SocketsClient.java new file mode 100644 index 0000000000..1c8e7b9a18 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsClient.java @@ -0,0 +1,133 @@ +package org.criu.java.tests; + +import java.io.*; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsClient { + static String TESTNAME = "SocketsClient"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + Socket socket = null; + Logger logger = null; + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + readMssg, msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4"; + String parentTestName, portArg; + int port; + + try { + parentTestName = args[0]; /* args[0]: name of the parent test, args[1]: port to connect to */ + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process.
+ */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Error: Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); /* NOTE(review): no exit here; presumably the STATE_LISTEN check below is expected to end the client in this case - confirm */ + } + logger.log(Level.INFO, "Client socket sending req to server at IP: 127.0.0.1 port:" + port); + + /* + * Ensure client does not try to connect to port before server has bound itself. + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT) { + ; + } + /* + * Socket Buffer should be put in SocketHelper.STATE_LISTEN state by server process, just before + * it starts listening for client connections.
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Error: Buffer does not contain the expected 'server bound to port and listening' state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + /* + * Ensure server has bound to port: wait 10 ms after STATE_LISTEN was seen before connecting + */ + try { + Thread.sleep(10); + } catch (InterruptedException e) { + logger.log(Level.WARNING, "InterruptedException occurred!"); + } + + socket = new Socket(SocketHelper.IP_ADDRESS, port); + + PrintStream out = new PrintStream(socket.getOutputStream()); + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + + logger.log(Level.INFO, "Sending message to server " + msg1); + out.println(msg1); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Message received from server " + readMssg); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "Error: wrong message received; message expected " + msg2); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); /* NOTE(review): execution continues after STATE_END here, unlike the msg4 check below which exits - confirm */ + } + + logger.log(Level.INFO, "Sending message to server " + msg3); + out.println(msg3); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Message received from server " + readMssg); + if (!msg4.equals(readMssg)) { + logger.log(Level.SEVERE, "Error: wrong message received; message expected " + msg4); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + socket.close(); + /* + * Wait for server process to end and then check whether it ended successfully or not + * If it has finished properly the socketMappedBuffer will contain SocketHelper.STATE_SUCCESS + */ + logger.log(Level.INFO, "Waiting for server process to end...."); + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + /* + * Check the server process has ended successfully, if it was a success put Mapped Buffer to pass state, else to failed state + */ + if
(socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + logger.log(Level.INFO, "Test ends"); + + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnect.java b/test/javaTests/src/org/criu/java/tests/SocketsConnect.java new file mode 100644 index 0000000000..164c210896 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnect.java @@ -0,0 +1,157 @@ +package org.criu.java.tests; + +import java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsConnect { + static String TESTNAME = "SocketsConnect"; + + /** + * Runs the client and server process, checkpoints the server when it is listening for incoming client connection requests on a port but no client has connected yet + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null, socketMappedBuffer = null; + FileChannel channel; + String pid; + String port = "49200"; + Logger logger = null; + try { + logger = Logger.getLogger(Helper.PACKAGE_NAME + "."
+ TESTNAME); + + /* + * Mapped buffer 'b' to communicate between CheckpointRestore.java and this process. + */ + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + Helper.init(TESTNAME, pid, logger); + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + if (b.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + /* + * Socket Mapped Buffer to communicate between server process, client process and this process. + */ + logger.log(Level.INFO, "Creating socketbufferfile and setting the init value of buffer"); + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/SocketsConnectFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + /* + * Set socketMappedBuffer to init state. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + logger.log(Level.INFO, "Starting server and client process"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsConnectServer", TESTNAME, port); + Process serverProcess = builder.start(); + builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "."
+ "SocketsConnectClient", TESTNAME, port); + Process clientProcess = builder.start(); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Some error took place in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "State is not the expected 'to be checkpointed' state"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + logger.log(Level.INFO, "Going to checkpoint server process"); + try { + Thread.sleep(10); /* NOTE(review): presumably gives the server time to block in accept() before the checkpoint is requested - confirm */ + } catch (InterruptedException e) { + logger.log(Level.WARNING, "Thread was interrupted"); + } + SocketHelper.checkpointAndWait(b, logger, serverProcess, clientProcess); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + logger.log(Level.INFO, "Process has been restored!"); + } + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; + } + char
bufchar = socketMappedBuffer.getChar(Helper.MAPPED_INDEX); + if (bufchar != Helper.STATE_FAIL && bufchar != Helper.STATE_PASS && bufchar != SocketHelper.STATE_SUCCESS) { + logger.log(Level.SEVERE, "Received wrong message from the child process: not the expected finish message"); + logger.log(Level.SEVERE, "Check their log files for more details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Error in the client or server process: check their log for details"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + ; /* busy-wait until STATE_SUCCESS has been replaced by the final verdict */ + } + + /* + * Client process puts socketMappedBuffer to 'P'-Pass state if the test passed. + * Send pass message to Checkpoint-restore.java + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_PASS) { + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (b != null) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java b/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java new file mode 100644 index 0000000000..ed1c7fab3c --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnectClient.java @@ -0,0 +1,130 @@ +package
org.criu.java.tests; + +import java.io.*; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsConnectClient { + static String TESTNAME = "SocketsConnectClient"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + Socket socket = null; + String parentTestName, portArg; + int port; + Logger logger = null; + try { + parentTestName = args[0]; /* args[0]: name of the parent test, args[1]: port to connect to */ + portArg = args[1]; + port = Integer.parseInt(portArg); + + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + readMssg, msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4"; + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsConnectFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE); /* no CREATE option: the buffer file must already exist */ + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "."
+ TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_RESTORE) { + logger.log(Level.SEVERE, "Error: Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Waiting for CR"); + /* + * Wait for Checkpoint-Restore to occur + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_RESTORE) { + logger.log(Level.SEVERE, "Error:Buffer does not contain the expected restored state: " + socketMappedBuffer.getChar(Helper.MAPPED_INDEX)); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + logger.log(Level.INFO, "Restored"); + logger.log(Level.INFO, "Client socket sending req to server at IP: 127.0.0.1 port:" + port); + + /* + * Server should have been listening for client connections when it was checkpointed, and it should continue to listen after restore.
+ */ + try { + socket = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception occured when connecting to port: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + PrintStream out = new PrintStream(socket.getOutputStream()); + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + + logger.log(Level.INFO, "Sending message to server " + msg1); + out.println(msg1); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg2); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Sending message to server " + msg3); + out.println(msg3); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if (!msg4.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg4); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + socket.close(); + + /* + * Wait for server process to end.
+ */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + /* + * Check the server process has ended successfully, if it was a success put Mapped Buffer to pass state, else to failed state + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java b/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java new file mode 100644 index 0000000000..1e4cf3aeb1 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsConnectServer.java @@ -0,0 +1,152 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.SocketException; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsConnectServer { + static String TESTNAME = "SocketsConnectServer"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + Socket socket = null; + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", msg3 = "@Ft@rCPM$$g3", + msg4 =
"Aft@rCPM$$g4", readMssg; + Logger logger = null; + String parentTestName, portArg; + int port; + try { + parentTestName = args[0]; /* args[0]: name of the parent test, args[1]: port to listen on */ + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsConnectFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + SocketHelper.writePid(parentTestName, pid); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + logger.log(Level.INFO, "Server pid: " + pid); + logger.log(Level.INFO, "socket buffer connection opened"); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); /* no exit here: the STATE_FAIL/STATE_END check before accept() below terminates the server */ + } + + ServerSocket ser = new ServerSocket(port); + logger.log(Level.INFO, "Server will be listening on Port: " + port); + + /* + * Timeout after 7 sec if client does not connect + */ + try { + ser.setSoTimeout(7 * 1000); + + } catch (SocketException e) { + logger.log(Level.SEVERE, "Cannot set timeout!"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); /* also picked up by the STATE_FAIL/STATE_END check before accept() below */ + } + logger.log(Level.INFO, "Waiting for client to connect"); + logger.log(Level.INFO, "Going to checkpoint"); + + try { + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { +
ser.close(); + System.exit(1); + } + /* + * Checkpoint when server is listening for connections, and no client has connected to the server. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + socket = ser.accept(); + SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); + + } catch (Exception e) { + logger.log(Level.SEVERE, "Timed out while waiting for client to connect\n" + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); /* 'socket' may still be null here, so do not fall through to the reads below */ + } + + if (!ser.isBound()) { + logger.log(Level.SEVERE, "Server is not bound to a port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (ser.getLocalPort() != port) { + logger.log(Level.SEVERE, "Server is not listening on correct port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + PrintStream outstream = new PrintStream(socket.getOutputStream()); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 1: " + readMssg); + if (!msg1.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 1 received was wrong,received: " + readMssg + " expected: " + msg1); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Sending message: " + msg2); + outstream.println(msg2); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 3: " + readMssg); + + if (!msg3.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 3 received was wrong, received: " + readMssg + " expected: " + msg3); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + outstream.println(msg4); + logger.log(Level.INFO, "Sent message 4 " + msg4); + + socket.close(); + + /* + * Put Socket-MappedBuffer to state SocketHelper.STATE_SUCCESS telling the server process has ended successfully.
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + System.exit(1); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_SUCCESS); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsData.java b/test/javaTests/src/org/criu/java/tests/SocketsData.java new file mode 100644 index 0000000000..67d8cef0e0 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsData.java @@ -0,0 +1,156 @@ +package org.criu.java.tests; + +import java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsData { + static String TESTNAME = "SocketsData"; + + /** + * Runs the server and client processes, checkpoints the client process when its in the middle of data transfer + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null, socketMappedBuffer = null; + FileChannel channel; + String pid; + Logger logger = null; + String port = "49200"; + try { + /* + * Mapped buffer 'b' to communicate between CheckpointRestore.java and this process.
+ */ + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + Helper.init(TESTNAME, pid, logger); + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + if (b.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + /* + * Socket Mapped Buffer to communicate between server process, client process and this process. + */ + logger.log(Level.INFO, "Creating socketbufferfile and setting the init value of buffer"); + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/SocketsDataFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + /* + * Set socketMappedBuffer to init state. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + logger.log(Level.INFO, "Starting server and client process"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsDataServer", TESTNAME, port); + Process serverProcess = builder.start(); + builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." 
+ "SocketsDataClient", TESTNAME, port); + Process clientProcess = builder.start(); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Some error took place in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "State is not the expected 'to be checkpointed' state"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + logger.log(Level.INFO, "Going to checkpoint client process"); + try { + Thread.sleep(10); + } catch (InterruptedException e) { + logger.log(Level.WARNING, "Thread was interrupted"); + } + SocketHelper.checkpointAndWait(b, logger, serverProcess, clientProcess); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + logger.log(Level.INFO, "Process has been restored!"); + } + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + char 
bufchar = socketMappedBuffer.getChar(Helper.MAPPED_INDEX); + if (bufchar != Helper.STATE_FAIL && bufchar != Helper.STATE_PASS && bufchar != SocketHelper.STATE_SUCCESS) { + logger.log(Level.SEVERE, "Received wrong message from the child process: not the expected finish message"); + logger.log(Level.SEVERE, "Check their log files for more details"); + serverProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Error in the client or server process: check their log for details"); + serverProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + ; + } + + /* + * Client process puts socketMappedBuffer to STATE_PASS if the test passed. + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_PASS) { + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + logger.log(Level.INFO, "Did not receive pass message from the client process"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (b != null) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java b/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java new file mode 100644 index 0000000000..49885a8866 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsDataClient.java @@ -0,0 +1,141 @@ 
+package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsDataClient { + static String TESTNAME = "SocketsDataClient"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + Socket socket = null; + String parentTestName, portArg; + int port; + Logger logger = null; + + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + readMssg, msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4"; + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsDataFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + + logger.log(Level.INFO, "Client pid: " + pid); + SocketHelper.writePid(parentTestName, pid); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Error: Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT) { + ; + } + /* + * Socket Mapped Buffer should be in 'Server listening for connections' state + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "socket-buffer not in expected state, current state: " + socketMappedBuffer.getChar(Helper.MAPPED_INDEX)); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + /* + * Server starts listening on port after putting the Mapped Buffer is in SocketHelper.STATE_LISTEN state + */ + logger.log(Level.INFO, "Client socket sending req to server at IP: 127.0.0.1 port:" + port); + + try { + socket = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (IOException e) { + logger.log(Level.SEVERE, "Exception occured when connecting to port: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + PrintStream out = new PrintStream(socket.getOutputStream()); + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + + logger.log(Level.INFO, "Sending message to server " + msg1); + out.println(msg1); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if 
(!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg2); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + /* + * Checkpoints and wait for Restore. Reached only when message 2 matched: the branch above exits so that its STATE_FAIL is not overwritten by the STATE_CHECKPOINT written here. + */ + logger.log(Level.INFO, "Going to checkpoint"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); + + logger.log(Level.INFO, "Sending message to server " + msg3); + out.println(msg3); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if (!msg4.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg4); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + socket.close(); + /* + * Wait for server process to end: spin for as long as the buffer still holds STATE_RESTORE. + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + /* + * Check the server process has ended successfully, if it was a success put Mapped Buffer to pass state, else to failed state + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + exception); + logger.log(Level.FINE, writer.toString()); + } + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java b/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java new file mode 100644 index 0000000000..65fe92a9d9 --- /dev/null +++
b/test/javaTests/src/org/criu/java/tests/SocketsDataServer.java @@ -0,0 +1,124 @@ +package org.criu.java.tests; + +import java.io.*; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.SocketException; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsDataServer { + static String TESTNAME = "SocketsDataServer"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + String parentTestName, portArg; + int port; + Socket socket = null; + Logger logger = null; + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4", readMssg; + + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsDataFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + logger.log(Level.INFO, "socket buffer connection opened"); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + ServerSocket ser = new ServerSocket(port); + logger.log(Level.INFO, "Server will be listening on Port " + port); + + /* + * Wait for 7 seconds for client to connect, else throw a timeout exception. If the timeout cannot be set, record STATE_FAIL and stop instead of accepting without a timeout. + */ + try { + ser.setSoTimeout(7 * 1000); + } catch (SocketException e) { + logger.log(Level.SEVERE, "cannot set timeout"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); /* exit here: the STATE_LISTEN write that follows would replace the recorded STATE_FAIL */ + } + + logger.log(Level.INFO, "Waiting for client to connect"); + /* + * Put Socket Mapped Buffer to SocketHelper.STATE_LISTEN state - server has bound to port and + * begin listening for connections.
+ */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); + socket = ser.accept(); + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + PrintStream outstream = new PrintStream(socket.getOutputStream()); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 1: " + readMssg); + + if (!msg1.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 1 received was wrong:rec " + readMssg + " expected: " + msg1); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + logger.log(Level.INFO, "Sending message: " + msg2); + outstream.println(msg2); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 3: " + readMssg); + + if (!msg3.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 3 received was wrong:rec " + readMssg + " expected: " + msg3); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + outstream.println(msg4); + logger.log(Level.INFO, "Sent message 4 " + msg4); + + socket.close(); + /* + * Put Socket-MappedBuffer to state SocketHelper.STATE_SUCCESS telling the server process has ended successfully. 
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + System.exit(1); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_SUCCESS); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListen.java b/test/javaTests/src/org/criu/java/tests/SocketsListen.java new file mode 100644 index 0000000000..3fad385493 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsListen.java @@ -0,0 +1,153 @@ +package org.criu.java.tests; + +import java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsListen { + static String TESTNAME = "SocketsListen"; + + /** + * Runs the client and server process, checkpoints the server process when the server has bound to a port, but has not yet started listening + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null, socketMappedBuffer = null; + FileChannel channel; + String pid; + String port = "49200"; + Logger logger = null; + try { + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + /* + * Mapped buffer 'b' to communicate between CheckpointRestore.java and this process. + */ + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + Helper.init(TESTNAME, pid, logger); + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + + if (b.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Creating socketbufferfile and setting the init value of buffer"); + + /* + * Socket Mapped Buffer to communicate between server process, client process and this process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/SocketsListenFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + /* + * Set socketMappedBuffer to init state. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + + logger.log(Level.INFO, "Starting server and client process"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsListenServer", TESTNAME, port); + Process serverProcess = builder.start(); + builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." 
+ "SocketsListenClient", TESTNAME, port); + Process clientProcess = builder.start(); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Some error took place in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "State is not the expected 'to be checkpointed' state"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + logger.log(Level.INFO, "Going to checkpoint server process"); + SocketHelper.checkpointAndWait(b, logger, serverProcess, clientProcess); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + logger.log(Level.INFO, "Process has been restored!"); + } + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; + } + char bufchar = socketMappedBuffer.getChar(Helper.MAPPED_INDEX); + if (bufchar != Helper.STATE_FAIL && bufchar != 
Helper.STATE_PASS && bufchar != SocketHelper.STATE_SUCCESS) { + logger.log(Level.SEVERE, "Received wrong message from the child process: not the expected finish message"); + logger.log(Level.SEVERE, "Check their log files for more details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Error in the client or server process: check their log for details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + ; + } + + /* + * Client process puts socketMappedBuffer to Helper.STATE_PASS-Pass state if the test passed. + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_PASS) { + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (b != null) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java b/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java new file mode 100644 index 0000000000..efcb3d545a --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsListenClient.java @@ -0,0 +1,136 @@ +package org.criu.java.tests; + +import java.io.*; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import 
java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsListenClient { + static String TESTNAME = "SocketsListenClient"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + Socket socket = null; + String parentTestName, portArg; + int port; + Logger logger = null; + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", readMssg, + msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4"; + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsListenFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_RESTORE && socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Error: Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + logger.log(Level.INFO, "Waiting for CR"); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + ; + } + + logger.log(Level.INFO, "Restored"); + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Buffer does not contain the expected 'server bound to port' state" + socketMappedBuffer.getChar(Helper.MAPPED_INDEX)); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + /* + * Make the thread sleep to ensure server is listening on the port for client connections. 
+ */ + logger.log(Level.INFO, "Put thread to sleep"); + try { + Thread.sleep(10); + } catch (InterruptedException e) { + logger.log(Level.WARNING, "Thread was interuptedp"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + logger.log(Level.INFO, "Client socket sending req to server at IP: 127.0.0.1 port:" + port); + try { + socket = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception occured when connecting to port: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); /* socket is still null here; continuing would only fail later with a NullPointerException */ + } + PrintStream out = new PrintStream(socket.getOutputStream()); + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + + logger.log(Level.INFO, "Sending message to server " + msg1); + out.println(msg1); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg2); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + logger.log(Level.INFO, "Sending message to server " + msg3); + out.println(msg3); + + readMssg = br.readLine(); + logger.log(Level.INFO, "message received from server " + readMssg); + if (!msg4.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Expected " + msg4); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + socket.close(); + + /* + * Wait for server process to end.
+ */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; + } + /* + * Check the server process has ended successfully, if it was a success put MappedBuffer to STATE_PASS, else to STATE_FAIL + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java b/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java new file mode 100644 index 0000000000..46fef40ecb --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsListenServer.java @@ -0,0 +1,160 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.net.ServerSocket; +import java.net.Socket; +import java.net.SocketException; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsListenServer { + static String TESTNAME = "SocketsListenServer"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + String parentTestName, portArg; + int port; + Logger logger = null; + Socket socket = null; + String readMssg, msg1 = 
"Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4"; + + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsListenFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + SocketHelper.writePid(parentTestName, pid); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + logger.log(Level.INFO, "Server pid: " + pid); + logger.log(Level.INFO, "socket buffer connection opened"); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + logger.log(Level.INFO, "Server will be listening on Port " + port); + ServerSocket ser = new ServerSocket(port); + /* + * Server has bound to a port but is not listening yet! + */ + logger.log(Level.INFO, "Going to checkpoint"); + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + ser.close(); + System.exit(1); + } + /* + * Checkpoint and wait for Restore. 
+ */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); + + if (!ser.isBound()) { + logger.log(Level.SEVERE, "Server is not bound to a port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (ser.getLocalPort() != port) { + logger.log(Level.SEVERE, "SServer is not listening on correct port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + /* + * Timeout after 5 sec if client does not connect + */ + try { + ser.setSoTimeout(5 * 1000); + + } catch (SocketException e) { + logger.log(Level.SEVERE, "cannot set timeout"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + try { + logger.log(Level.INFO, "Waiting for client to connect"); + /* + * Put Socket Mapped Buffer to SocketHelper.STATE_LISTEN state - server has bound to port and + * will begin listening for connections. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); + socket = ser.accept(); + + } catch (Exception e) { + logger.log(Level.SEVERE, "Timed out while waiting for client to connect\n" + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + PrintStream outstream = new PrintStream(socket.getOutputStream()); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 1: " + readMssg); + if (!msg1.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 1 received was wrong:rec " + readMssg + " expected: " + msg1); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + logger.log(Level.INFO, "Sending message: " + msg2); + outstream.println(msg2); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 3: " + readMssg); + + if (!msg3.equals(readMssg)) { + 
logger.log(Level.SEVERE, "Message 3 received was wrong:rec " + readMssg + " expected: " + msg3); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + outstream.println(msg4); + logger.log(Level.INFO, "Sending message: " + msg4); + + /* + * Put Socket-MappedBuffer to state SocketHelper.STATE_SUCCESS telling the server process has ended successfully. + */ + socket.close(); + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + System.exit(1); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_SUCCESS); + } + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.FINE, writer.toString()); + } + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java b/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java new file mode 100644 index 0000000000..5e55c42741 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultiple.java @@ -0,0 +1,152 @@ +package org.criu.java.tests; + +import java.io.File; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsMultiple { + static String TESTNAME = "SocketsMultiple"; + + /** + * Runs the Client and Server Processes, Multiple clients connect to server Process, 
checkpoints the server process + * + * @param args Not used + */ + public static void main(String[] args) { + MappedByteBuffer b = null, socketMappedBuffer = null; + FileChannel channel; + String pid; + String port = "49200"; + Logger logger = null; + try { + /* + * Mapped buffer 'b' to communicate between CheckpointRestore.java and this process. + */ + File f = new File(Helper.MEMORY_MAPPED_FILE_NAME); + channel = FileChannel.open(f.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + b = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + pid = bean.getName(); + Helper.init(TESTNAME, pid, logger); + logger.log(Level.INFO, "Test init done; pid written to pid file; beginning with test"); + + if (b.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Error: Error in memory mapping, test is not in init state"); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + /* + * Socket Mapped Buffer to communicate between server process, client process and this process. + */ + logger.log(Level.INFO, "Creating socketBufferFile and setting the init value of buffer"); + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + TESTNAME + "/SocketsMultipleFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + /* + * Set socketMappedBuffer to init state. + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_INIT); + + logger.log(Level.INFO, "Starting server and client process"); + ProcessBuilder builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." 
+ "SocketsMultipleServer", TESTNAME, port); + Process serverProcess = builder.start(); + builder = new ProcessBuilder("java", "-cp", "target/classes", Helper.PACKAGE_NAME + "." + "SocketsMultipleClient", TESTNAME, port); + Process clientProcess = builder.start(); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Some error took place in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "Exception occured in the client or server process: check their log for details"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_CHECKPOINT) { + logger.log(Level.SEVERE, "Killing the server process and client process"); + logger.log(Level.SEVERE, "State is not the expected 'to be checkpointed' state"); + serverProcess.destroy(); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + logger.log(Level.INFO, "Going to checkpoint server process"); + SocketHelper.checkpointAndWait(b, logger, serverProcess, clientProcess); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_RESTORE); + logger.log(Level.INFO, "Process has been restored!"); + } + + while 
(socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + char bufchar = socketMappedBuffer.getChar(Helper.MAPPED_INDEX); + if (bufchar != Helper.STATE_FAIL && bufchar != Helper.STATE_PASS && bufchar != SocketHelper.STATE_SUCCESS) { + logger.log(Level.SEVERE, "Received wrong message from the child process: not the expected finish message"); + logger.log(Level.SEVERE, "Check their log files for more details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + logger.log(Level.SEVERE, "Error in the client or server process: check their log for details"); + clientProcess.destroy(); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + ; + } + + /* + * Client process puts socketMappedBuffer to STATE_PASS state if the test passed. 
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_PASS) { + logger.log(Level.INFO, Helper.PASS_MESSAGE); + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + System.exit(0); + + } catch (Exception e) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + e.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + e); + logger.log(Level.FINE, writer.toString()); + } + if (b != null) { + b.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + System.exit(5); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java b/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java new file mode 100644 index 0000000000..d97a946fd2 --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultipleClient.java @@ -0,0 +1,174 @@ +package org.criu.java.tests; + +import java.io.*; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsMultipleClient { + static String TESTNAME = "SocketsMultipleClient"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + String msg1 = "Message1", msg2 = "Message2", readMssg; + Socket socket1 = null, socket2 = null, socket3 = null, socket4 = null; + String parentTestName, portArg; + int port; + Logger logger = null; + + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. 
+ */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsMultipleFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != SocketHelper.STATE_LISTEN) { + logger.log(Level.SEVERE, "Error: Socket-buffer not in expected state"); + + } + try { + logger.log(Level.INFO, "client 1 connecting..."); + socket1 = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception when client connects to server: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + logger.log(Level.INFO, "Client 1 connected to server successfully"); + PrintStream out1 = new PrintStream(socket1.getOutputStream()); + BufferedReader br1 = new BufferedReader(new InputStreamReader(socket1.getInputStream())); + logger.log(Level.INFO, "Got input and output streams for socket1"); + try { + logger.log(Level.INFO, "client 2 connecting..."); + socket2 = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception when client connects to server: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + logger.log(Level.INFO, "Client 2 connected to server successfully"); + PrintStream out2 = new PrintStream(socket2.getOutputStream()); + BufferedReader br2 = new BufferedReader(new InputStreamReader(socket2.getInputStream())); + logger.log(Level.INFO, "Got input and output streams for 
socket2"); + + try { + logger.log(Level.INFO, "client 3 connecting..."); + socket3 = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception when client connects to server: " + e); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + logger.log(Level.INFO, "Client 3 connected to server successfully"); + PrintStream out3 = new PrintStream(socket3.getOutputStream()); + BufferedReader br3 = new BufferedReader(new InputStreamReader(socket3.getInputStream())); + logger.log(Level.INFO, "Got input and output streams for socket3"); + + out1.println(msg1); + + readMssg = br1.readLine(); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received; Received: " + readMssg); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + socket1.close(); + + out2.println(msg1); + + /* + * Wait for Checkpoint-Restore + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_INIT || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_LISTEN || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_CHECKPOINT) { + ; + } + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_RESTORE) { + logger.log(Level.SEVERE, "Socket-mapped-buffer is not in restored state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Server is Restored!!"); + + out3.println(msg1); + readMssg = br2.readLine(); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received by client 2; Received: " + readMssg); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + readMssg = br3.readLine(); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received by client 3; Received: " + readMssg); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + 
socket2.close(); + socket3.close(); + + try { + logger.log(Level.INFO, "client 4 connecting..."); + socket4 = new Socket(SocketHelper.IP_ADDRESS, port); + } catch (Exception e) { + logger.log(Level.SEVERE, "Exception when client connects to server: " + e); + } + logger.log(Level.INFO, "Client 4 connected to server successfully"); + PrintStream out4 = new PrintStream(socket4.getOutputStream()); + BufferedReader br4 = new BufferedReader(new InputStreamReader(socket4.getInputStream())); + logger.log(Level.INFO, "Got input and output streams for socket4"); + + out4.println(msg1); + readMssg = br4.readLine(); + if (!msg2.equals(readMssg)) { + logger.log(Level.SEVERE, "wrong message received by client 4; Received: " + readMssg); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + socket4.close(); + /* + * Wait for server process to end. + */ + while (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_RESTORE) { + ; + } + /* + * Check the server process has ended successfully, if it was a success put Mapped Buffer to STATE_PASS, else to STATE_FAIL + */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == SocketHelper.STATE_SUCCESS) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_PASS); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occured:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java b/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java new file mode 100644 index 0000000000..a7e4d3b9ef --- 
/dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsMultipleServer.java @@ -0,0 +1,215 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.net.ServerSocket; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsMultipleServer { + static String TESTNAME = "SocketsMultipleServer"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + FileChannel channel; + String parentTestName, portArg; + int port; + Logger logger = null; + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. + */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsMultipleFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." 
+ TESTNAME); + SocketHelper.init(TESTNAME, parentTestName, logger); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + SocketHelper.writePid(parentTestName, pid); + + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + logger.log(Level.INFO, "Server pid: " + pid); + logger.log(Level.INFO, "socket buffer connection opened"); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + System.exit(1); + } + + /* + * The array indexes 3, 5, 7 and 9 will map the state of client 1, 2, 3 and 4. + * Set these array indexes to init state. + */ + + socketMappedBuffer.putChar(3, Helper.STATE_INIT); + socketMappedBuffer.putChar(5, Helper.STATE_INIT); + socketMappedBuffer.putChar(7, Helper.STATE_INIT); + socketMappedBuffer.putChar(9, Helper.STATE_INIT); + + ServerSocket ser = new ServerSocket(port); + logger.log(Level.INFO, "Server will be listening on Port " + port); + + Socket[] sockets = new Socket[4]; + + /* + * Set the SocketMappedBuffer to S state-server will be listening for connections + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); + + for (int i = 1; i <= 4; i++) { + sockets[i - 1] = ser.accept(); + ServerThread serverThread = new ServerThread(sockets[i - 1], "ser-socket " + i, 2 * i + 1, logger, socketMappedBuffer); + serverThread.start(); + if (i == 3) { + logger.log(Level.INFO, "Connected to client: 3"); + /* + * Client 3 has connected, wait for thread 1 to finish and then checkpoint. 
+ */ + while (socketMappedBuffer.getChar(3) != Helper.STATE_FAIL && socketMappedBuffer.getChar(3) != Helper.STATE_PASS) { + ; + } + logger.log(Level.INFO, "Going to checkpoint"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); + } + } + + /* + * Loop while any of the 4 thread is running + */ + while (socketMappedBuffer.getChar(3) == Helper.STATE_INIT || socketMappedBuffer.getChar(5) == Helper.STATE_INIT + || socketMappedBuffer.getChar(7) == Helper.STATE_INIT || socketMappedBuffer.getChar(9) == Helper.STATE_INIT) { + ; + } + + /* + * Check Socket Mapped Buffer for a thread that failed + */ + for (int i = 1; i <= 4; i++) { + if (socketMappedBuffer.getChar(i * 2 + 1) == Helper.STATE_FAIL) { + logger.log(Level.SEVERE, "Error in thread connected to client " + i); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + + /* + * Check the 1st Socket is closed + */ + if (!sockets[0].isClosed()) { + logger.log(Level.SEVERE, "socket 1 is not closed"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + logger.log(Level.INFO, "Socket 1 is in expected closed state: " + sockets[0].isClosed()); + + /* + * Check all threads are in expected pass state + */ + for (int i = 1; i <= 4; i++) { + if (socketMappedBuffer.getChar(i * 2 + 1) != Helper.STATE_PASS) { + logger.log(Level.SEVERE, "Unexpected State of buffer: " + socketMappedBuffer.getChar(i * 2 + 1) + ", client: " + i); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + } + logger.log(Level.INFO, "Done"); + + /* + * Put Socket-MappedBuffer to state SocketHelper.STATE_SUCCESS telling the server process has ended successfully. 
+ */ + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + System.exit(1); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_SUCCESS); + } + + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} + +class ServerThread extends Thread { + Socket socket = null; + String name; + int num; + MappedByteBuffer socketMappedBuffer; + Logger logger; + + ServerThread(Socket socket, String name, int num, Logger logger, MappedByteBuffer socketMappedBuffer) { + this.socket = socket; + this.name = name; + this.logger = logger; + this.num = num; + this.socketMappedBuffer = socketMappedBuffer; + } + + public void run() { + try { + String readMssg, msg1 = "Message1", msg2 = "Message2"; + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + PrintStream out = new PrintStream(socket.getOutputStream()); + readMssg = br.readLine(); + if (!msg1.equals(readMssg)) { + logger.log(Level.SEVERE, "Message read by thread " + name + " was not 'Message1', received Message: " + readMssg); + socket.close(); + socketMappedBuffer.putChar(num, Helper.STATE_FAIL); + } else { + logger.log(Level.INFO, name + " received correct message"); + out.println(msg2); + logger.log(Level.INFO, name + " has sent message"); + socket.close(); + socketMappedBuffer.putChar(num, Helper.STATE_PASS); + } + + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + 
exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred in thread :" + name + " " + exception); + logger.log(Level.FINE, writer.toString()); + } + + try { + if (socket != null) { + socket.close(); + } + } catch (IOException e) { + ; + } + + /* + * If exception occurs fail the thread + */ + socketMappedBuffer.putChar(num, Helper.STATE_FAIL); + } + } +} diff --git a/test/javaTests/src/org/criu/java/tests/SocketsServer.java b/test/javaTests/src/org/criu/java/tests/SocketsServer.java new file mode 100644 index 0000000000..051233443d --- /dev/null +++ b/test/javaTests/src/org/criu/java/tests/SocketsServer.java @@ -0,0 +1,142 @@ +package org.criu.java.tests; + +import java.io.*; +import java.lang.management.ManagementFactory; +import java.lang.management.RuntimeMXBean; +import java.net.ServerSocket; +import java.net.Socket; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.file.StandardOpenOption; +import java.util.logging.Level; +import java.util.logging.Logger; + +class SocketsServer { + static String TESTNAME = "SocketsServer"; + + public static void main(String[] args) { + MappedByteBuffer socketMappedBuffer = null; + String msg1 = "Ch@ckM@$$@Ge!1", msg2 = "cH@C!m$SG!!2", + msg3 = "@Ft@rCPM$$g3", msg4 = "Aft@rCPM$$g4", readMssg; + FileChannel channel; + String parentTestName, portArg; + int port; + Logger logger = null; + + try { + parentTestName = args[0]; + portArg = args[1]; + port = Integer.parseInt(portArg); + + /* + * Socket Mapped Buffer to communicate between server process, client process and the calling parent process. 
+ */ + File socketfile = new File(Helper.OUTPUT_FOLDER_NAME + "/" + parentTestName + "/SocketsFile"); + channel = FileChannel.open(socketfile.toPath(), StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + socketMappedBuffer = channel.map(MapMode.READ_WRITE, 0, Helper.MAPPED_REGION_SIZE); + channel.close(); + + logger = Logger.getLogger(Helper.PACKAGE_NAME + "." + TESTNAME); + + SocketHelper.init(TESTNAME, parentTestName, logger); + logger.log(Level.INFO, "Begin"); + logger.log(Level.INFO, "Parent name: " + parentTestName); + + RuntimeMXBean bean = ManagementFactory.getRuntimeMXBean(); + String pid = bean.getName(); + SocketHelper.writePid(parentTestName, pid); + + logger.log(Level.INFO, "Socket buffer mapped"); + + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) != Helper.STATE_INIT) { + logger.log(Level.SEVERE, "Socket-buffer not in expected Init state"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + + ServerSocket ser = new ServerSocket(port); + logger.log(Level.INFO, "Server will be listening on Port " + port); + + /* + * Timeout after 5 second if client does not connect + */ + ser.setSoTimeout(5 * 1000); + logger.log(Level.INFO, "Waiting for client to connect"); + Socket socket = null; + try { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_LISTEN); + socket = ser.accept(); + } catch (Exception e) { + logger.log(Level.SEVERE, "Timed out while waiting for client to connect"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_END); + } + BufferedReader br = new BufferedReader(new InputStreamReader(socket.getInputStream())); + PrintStream outstream = new PrintStream(socket.getOutputStream()); + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 1: " + readMssg); + if (!msg1.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 1 received was wrong:rec " + readMssg + " expected: " + msg1); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, 
Helper.STATE_END); + } + + logger.log(Level.INFO, "Sending message: " + msg2); + outstream.println(msg2); + + logger.log(Level.INFO, "Going to checkpoint"); + /* + * Put socket Mapped Buffer to 'to be checkpointed' state and wait for restore + */ + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_CHECKPOINT); + SocketHelper.socketWaitForRestore(socketMappedBuffer, logger); + + if (!ser.isBound()) { + logger.log(Level.SEVERE, "Server is not bound to a port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + if (ser.getLocalPort() != port) { + logger.log(Level.SEVERE, "Server is not listening on correct port"); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + System.exit(1); + } + + readMssg = br.readLine(); + logger.log(Level.INFO, "Read message 3: " + readMssg); + + if (!msg3.equals(readMssg)) { + logger.log(Level.SEVERE, "Message 3 received was wrong:rec " + readMssg + " expected: " + msg3); + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + socket.close(); + System.exit(1); + } + + outstream.println(msg4); + logger.log(Level.INFO, "Sent message 4 " + msg4); + + /* + * Put Socket-MappedBuffer to state SocketHelper.STATE_SUCCESS telling the server process has ended successfully. 
+ */ + socket.close(); + if (socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_FAIL || socketMappedBuffer.getChar(Helper.MAPPED_INDEX) == Helper.STATE_END) { + System.exit(1); + } else { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, SocketHelper.STATE_SUCCESS); + } + + } catch (Exception exception) { + if (null != logger) { + StringWriter writer = new StringWriter(); + PrintWriter printWriter = new PrintWriter(writer); + exception.printStackTrace(printWriter); + logger.log(Level.SEVERE, "Exception occurred:" + exception); + logger.log(Level.FINE, writer.toString()); + } + + if (socketMappedBuffer != null) { + socketMappedBuffer.putChar(Helper.MAPPED_INDEX, Helper.STATE_FAIL); + } + } + } +} diff --git a/test/javaTests/test.xml b/test/javaTests/test.xml new file mode 100644 index 0000000000..4768bf1935 --- /dev/null +++ b/test/javaTests/test.xml @@ -0,0 +1,89 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/jenkins/criu-fault.sh b/test/jenkins/criu-fault.sh index ec6d26f892..c27dd37389 100755 --- a/test/jenkins/criu-fault.sh +++ b/test/jenkins/criu-fault.sh @@ -12,6 +12,10 @@ prep ./test/zdtm.py run -t zdtm/static/vdso01 --fault 127 || fail ./test/zdtm.py run -t zdtm/static/vdso-proxy --fault 127 --iters 3 || fail +if [ "${COMPAT_TEST}" != "y" ] ; then + ./test/zdtm.py run -t zdtm/static/vdso01 --fault 133 -f h || fail +fi + ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 2 --keep-going --report report || fail ./test/zdtm.py run -t zdtm/static/mntns_ghost --fault 4 --keep-going --report report || fail diff --git a/test/others/libcriu/Makefile b/test/others/libcriu/Makefile index 5289ed15aa..226396e6a0 100644 --- a/test/others/libcriu/Makefile +++ b/test/others/libcriu/Makefile @@ -1,3 +1,5 @@ +include ../../../../criu/Makefile.versions + TESTS += test_sub TESTS += test_self TESTS += test_notify 
@@ -19,8 +21,16 @@ endef $(foreach t, $(TESTS), $(eval $(call genb, $(t)))) %.o: %.c - gcc -c $^ -I../../../../criu/lib/c/ -I../../../../criu/images/ -o $@ -Werror + gcc -c $^ -iquote ../../../../criu/criu/include -I../../../../criu/lib/c/ -I../../../../criu/images/ -o $@ -Werror -clean: +clean: libcriu_clean rm -rf $(TESTS) $(TESTS:%=%.o) lib.o .PHONY: clean + +libcriu_clean: + rm -f libcriu.so.${CRIU_SO_VERSION_MAJOR} +.PHONY: libcriu_clean + +libcriu: + ln -s ../../../../criu/lib/c/libcriu.so libcriu.so.${CRIU_SO_VERSION_MAJOR} +.PHONY: libcriu diff --git a/test/others/libcriu/run.sh b/test/others/libcriu/run.sh index a99b91e52a..bd92f8544b 100755 --- a/test/others/libcriu/run.sh +++ b/test/others/libcriu/run.sh @@ -5,16 +5,15 @@ source ../env.sh || exit 1 echo "== Clean" make clean +make libcriu rm -rf wdir -rm -f ./libcriu.so.1 echo "== Prepare" mkdir -p wdir/i/ echo "== Run tests" -ln -s ../../../../criu/lib/c/libcriu.so libcriu.so.1 export LD_LIBRARY_PATH=. -export PATH="`dirname ${BASH_SOURCE[0]}`/../../:$PATH" +export PATH="`dirname ${BASH_SOURCE[0]}`/../../../criu:$PATH" RESULT=0 @@ -22,6 +21,19 @@ function run_test { echo "== Build $1" if ! 
make $1; then echo "FAIL build $1" + echo "** Output of $1/test.log" + cat wdir/i/$1/test.log + echo "---------------" + if [ -f wdir/i/$1/dump.log ]; then + echo "** Contents of dump.log" + cat wdir/i/$1/dump.log + echo "---------------" + fi + if [ -f wdir/i/$1/restore.log ]; then + echo "** Contents of restore.log" + cat wdir/i/$1/restore.log + echo "---------------" + fi RESULT=1; else echo "== Test $1" @@ -40,6 +52,6 @@ run_test test_iters run_test test_errno echo "== Tests done" -unlink libcriu.so.1 +make libcriu_clean [ $RESULT -eq 0 ] && echo "Success" || echo "FAIL" exit $RESULT diff --git a/test/others/rpc/config_file.py b/test/others/rpc/config_file.py index 7b07bc145f..90c80fcaea 100755 --- a/test/others/rpc/config_file.py +++ b/test/others/rpc/config_file.py @@ -1,11 +1,12 @@ #!/usr/bin/python +import argparse import os import sys -import rpc_pb2 as rpc -import argparse -from tempfile import mkstemp import time +from tempfile import mkstemp + +import rpc_pb2 as rpc from setup_swrk import setup_swrk diff --git a/test/others/unix-callback/unix-client.c b/test/others/unix-callback/unix-client.c index 69808b53cf..676c4adbc8 100644 --- a/test/others/unix-callback/unix-client.c +++ b/test/others/unix-callback/unix-client.c @@ -86,7 +86,7 @@ static int check_sock(int i) return 0; } -int main() +int main(void) { int i, fd; sigset_t set; diff --git a/test/others/unix-callback/unix-server.c b/test/others/unix-callback/unix-server.c index 8f32f53dd9..47bebd05d5 100644 --- a/test/others/unix-callback/unix-server.c +++ b/test/others/unix-callback/unix-server.c @@ -19,7 +19,7 @@ struct ticket *tickets; #define SK_NAME "/tmp/criu.unix.callback.test" -int main() +int main(void) { int sk, ret, id; char buf[4096]; diff --git a/test/zdtm.py b/test/zdtm.py index 0153c60589..b037128dfd 100755 --- a/test/zdtm.py +++ b/test/zdtm.py @@ -1,31 +1,33 @@ #!/usr/bin/env python # vim: noet ts=8 sw=8 sts=8 from __future__ import absolute_import, division, print_function, 
unicode_literals -from builtins import (str, open, range, zip, int, input) import argparse +import atexit +import datetime +import errno +import fcntl import glob +import linecache +import mmap import os -import subprocess -import time -import tempfile -import shutil +import random import re -import stat +import shutil import signal -import atexit -import sys -import linecache -import random +import stat import string -import fcntl -import errno -import datetime -import yaml import struct -import mmap +import subprocess +import sys +import tempfile +import time +from builtins import (input, int, open, range, str, zip) + import pycriu as crpc +import yaml + os.chdir(os.path.dirname(os.path.abspath(__file__))) prev_line = None @@ -62,6 +64,7 @@ def traceit(f, e, a): def clean_tests_root(): global tests_root if tests_root and tests_root[0] == os.getpid(): + os.rmdir(os.path.join(tests_root[1], "root")) os.rmdir(tests_root[1]) @@ -70,7 +73,9 @@ def make_tests_root(): if not tests_root: tests_root = (os.getpid(), tempfile.mkdtemp("", "criu-root-", "/tmp")) atexit.register(clean_tests_root) - return tests_root[1] + os.mkdir(os.path.join(tests_root[1], "root")) + os.chmod(tests_root[1], 0o777) + return os.path.join(tests_root[1], "root") # Report generation @@ -483,6 +488,13 @@ def start(self): # move into some semi-random state time.sleep(random.random()) + if self.__flavor.ns: + # In the case of runc the path specified with the opts.root + # option is created in /run/runc/ which is inaccessible to + # unprivileged users. The permissions here are set to test + # this use case. + os.chmod(os.path.dirname(self.__flavor.root), 0o700) + def kill(self, sig=signal.SIGKILL): self.__freezer.thaw() if self.__pid: @@ -679,9 +691,17 @@ def start(self): i = 0 for _, peer_file in self.__files: msg = self.__get_message(i) - my_file.close() try: - data = peer_file.read(16) + # File pairs naturally block on read() until the write() + # happen (or the writer is closed). 
This is not the case for + # regular files, so we loop. + data = b'' + while not data: + # In python 2.7, peer_file.read() doesn't call the read + # system call once it has read the file to the end, so + # read the raw descriptor with os.read() instead. + data = os.read(peer_file.fileno(), 16) + time.sleep(0.1) except Exception as e: print("Unable to read a peer file: %s" % e) sys.exit(1) @@ -747,6 +767,11 @@ def getropts(self): fcntl.fcntl(fd, fcntl.F_SETFD, fdflags) peer_file_name = self.__peer_file_names[i] ropts.extend(["--inherit-fd", "fd[%d]:%s" % (fd, peer_file_name)]) + self.__peer_file_names = [] + self.__dump_opts = [] + for _, peer_file in self.__files: + self.__peer_file_names.append(self.__fdtyp.filename(peer_file)) + self.__dump_opts += self.__fdtyp.dump_opts(peer_file) return ropts def print_output(self): @@ -867,69 +892,57 @@ class criu_rpc: def __set_opts(criu, args, ctx): while len(args) != 0: arg = args.pop(0) - if arg == '-v4': + if "-v4" == arg: criu.opts.log_level = 4 - continue - if arg == '-o': + elif "-o" == arg: criu.opts.log_file = args.pop(0) - continue - if arg == '-D': + elif "-D" == arg: criu.opts.images_dir_fd = os.open(args.pop(0), os.O_DIRECTORY) ctx['imgd'] = criu.opts.images_dir_fd - continue - if arg == '-t': + elif "-t" == arg: criu.opts.pid = int(args.pop(0)) - continue - if arg == '--pidfile': + elif "--pidfile" == arg: ctx['pidf'] = args.pop(0) - continue - if arg == '--timeout': + elif "--timeout" == arg: criu.opts.timeout = int(args.pop(0)) - continue - if arg == '--restore-detached': - # Set by service by default - ctx['rd'] = True - continue - if arg == '--root': + elif "--restore-detached" == arg: + ctx['rd'] = True # Set by service by default + elif "--root" == arg: criu.opts.root = args.pop(0) - continue - if arg == '--external': + elif "--external" == arg: criu.opts.external.append(args.pop(0)) - continue - if arg == '--status-fd': + elif "--status-fd" == arg: fd = int(args.pop(0)) os.write(fd, b"\0") fcntl.fcntl(fd, 
fcntl.F_SETFD, fcntl.FD_CLOEXEC) - continue - if arg == '--port': + elif "--port" == arg: criu.opts.ps.port = int(args.pop(0)) - continue - if arg == '--address': + elif "--address" == arg: criu.opts.ps.address = args.pop(0) + elif "--page-server" == arg: continue - if arg == '--page-server': - continue - if arg == '--prev-images-dir': + elif "--prev-images-dir" == arg: criu.opts.parent_img = args.pop(0) - continue - if arg == '--track-mem': + elif "--pre-dump-mode" == arg: + key = args.pop(0) + mode = crpc.rpc.VM_READ + if key == "splice": + mode = crpc.rpc.SPLICE + criu.opts.pre_dump_mode = mode + elif "--track-mem" == arg: criu.opts.track_mem = True - continue - if arg == '--tcp-established': + elif "--tcp-established" == arg: criu.opts.tcp_established = True - continue - if arg == '--restore-sibling': + elif "--restore-sibling" == arg: criu.opts.rst_sibling = True - continue - if arg == "--inherit-fd": + elif "--inherit-fd" == arg: inhfd = criu.opts.inherit_fd.add() key = args.pop(0) fd, key = key.split(":", 1) inhfd.fd = int(fd[3:-1]) inhfd.key = key - continue - - raise test_fail_exc('RPC for %s required' % arg) + else: + raise test_fail_exc('RPC for %s(%s) required' % (arg, args.pop(0))) @staticmethod def run(action, @@ -1011,6 +1024,7 @@ def __init__(self, opts): self.__mdedup = bool(opts['noauto_dedup']) self.__user = bool(opts['user']) self.__leave_stopped = bool(opts['stop']) + self.__remote = bool(opts['remote']) self.__criu = (opts['rpc'] and criu_rpc or criu_cli) self.__show_stats = bool(opts['show_stats']) self.__lazy_pages_p = None @@ -1019,6 +1033,7 @@ def __init__(self, opts): self.__tls = self.__tls_options() if opts['tls'] else [] self.__criu_bin = opts['criu_bin'] self.__crit_bin = opts['crit_bin'] + self.__pre_dump_mode = opts['pre_dump_mode'] def fini(self): if self.__lazy_migrate: @@ -1235,6 +1250,32 @@ def dump(self, action, opts=[]): a_opts += self.__test.getdopts() + if self.__remote: + logdir = os.getcwd() + "/" + self.__dump_path + "/" 
+ str( + self.__iter) + print("Adding image cache") + + cache_opts = [ + self.__criu_bin, "image-cache", "--port", "12345", "-v4", "-o", + logdir + "/image-cache.log", "-D", logdir + ] + + subprocess.Popen(cache_opts).pid + time.sleep(1) + + print("Adding image proxy") + + proxy_opts = [ + self.__criu_bin, "image-proxy", "--port", "12345", "--address", + "localhost", "-v4", "-o", logdir + "/image-proxy.log", "-D", + logdir + ] + + subprocess.Popen(proxy_opts).pid + time.sleep(1) + + a_opts += ["--remote"] + if self.__dedup: a_opts += ["--auto-dedup"] @@ -1249,6 +1290,8 @@ def dump(self, action, opts=[]): a_opts += ['--leave-stopped'] if self.__empty_ns: a_opts += ['--empty-ns', 'net'] + if self.__pre_dump_mode: + a_opts += ["--pre-dump-mode", "%s" % self.__pre_dump_mode] nowait = False if self.__lazy_migrate and action == "dump": @@ -1287,6 +1330,9 @@ def restore(self): r_opts += ['--empty-ns', 'net'] r_opts += ['--action-script', os.getcwd() + '/empty-netns-prep.sh'] + if self.__remote: + r_opts += ["--remote"] + if self.__dedup: r_opts += ["--auto-dedup"] @@ -1834,8 +1880,8 @@ def run_test(self, name, desc, flavor): 'stop', 'empty_ns', 'fault', 'keep_img', 'report', 'snaps', 'sat', 'script', 'rpc', 'lazy_pages', 'join_ns', 'dedup', 'sbs', 'freezecg', 'user', 'dry_run', 'noauto_dedup', - 'remote_lazy_pages', 'show_stats', 'lazy_migrate', - 'tls', 'criu_bin', 'crit_bin') + 'remote_lazy_pages', 'show_stats', 'lazy_migrate', 'remote', + 'tls', 'criu_bin', 'crit_bin', 'pre_dump_mode') arg = repr((name, desc, flavor, {d: self.__opts[d] for d in nd})) if self.__use_log: @@ -2018,7 +2064,7 @@ def print_sep(title, sep="=", width=80): def print_error(line): line = line.rstrip() - print(line) + print(line.encode('utf-8')) if line.endswith('>'): # combine pie output return True return False @@ -2028,7 +2074,7 @@ def grep_errors(fname): first = True print_next = False before = [] - with open(fname) as fd: + with open(fname, errors='replace') as fd: for l in fd: 
before.append(l) if len(before) > 5: @@ -2482,6 +2528,10 @@ def clean_stuff(opts): rp.add_argument("--crit-bin", help="Path to crit binary", default='../crit/crit') +rp.add_argument("--pre-dump-mode", + help="Use splice or read mode of pre-dumping", + choices=['splice', 'read'], + default='splice') lp = sp.add_parser("list", help="List tests") lp.set_defaults(action=list_tests) diff --git a/test/zdtm/Makefile.inc b/test/zdtm/Makefile.inc index 170f31632e..43763321f9 100644 --- a/test/zdtm/Makefile.inc +++ b/test/zdtm/Makefile.inc @@ -1,41 +1,47 @@ .SUFFIXES: MAKEFLAGS += -r -ARCH ?= $(shell uname -m | sed \ - -e s/i.86/x86/ \ - -e s/x86_64/x86/ \ - -e s/sun4u/sparc64/ \ - -e s/arm.*/arm/ \ - -e s/sa110/arm/ \ - -e s/s390x/s390/ \ - -e s/parisc64/parisc/ \ - -e s/ppc64.*/ppc64/ \ - -e s/mips.*/mips/ \ - -e s/sh[234].*/sh/ \ +SUBARCH ?= $(shell uname -m) +ARCH ?= $(shell echo $(SUBARCH) | sed \ + -e s/i.86/x86/ \ + -e s/x86_64/x86/ \ + -e s/sun4u/sparc64/ \ + -e s/arm.*/arm/ \ + -e s/sa110/arm/ \ + -e s/s390x/s390/ \ + -e s/parisc64/parisc/ \ + -e s/ppc64.*/ppc64/ \ + -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ \ -e s/aarch64.*/arm64/) ifeq ($(ARCH),arm64) - ARCH ?= aarch64 - SRCARCH ?= aarch64 + ARCH := aarch64 endif -SRCARCH ?= $(ARCH) - ifeq ($(ARCH),arm) - ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') - - ifeq ($(ARMV),6) - USERCFLAGS += -march=armv6 - else ifeq ($(ARMV),7) - USERCFLAGS += -march=armv7-a - endif + ARMV := $(shell echo $(SUBARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7') + + ifeq ($(ARMV),6) + USERCFLAGS += -march=armv6 + else ifeq ($(ARMV),7) + USERCFLAGS += -march=armv7-a + else ifeq ($(ARMV),8) + # To build aarch32 on armv8 Travis-CI (see criu Makefile) + USERCFLAGS += -march=armv7-a + ARMV := 7 + endif endif -CC := gcc +HOSTCC ?= gcc +ifeq ($(origin CC), default) + CC := $(CROSS_COMPILE)$(HOSTCC) +endif CFLAGS += -g -O2 -Wall -Werror -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=0 +CFLAGS += 
-Wdeclaration-after-statement -Wstrict-prototypes CFLAGS += $(USERCFLAGS) CFLAGS += -D_GNU_SOURCE -CPPFLAGS += -iquote $(LIBDIR)/arch/$(SRCARCH)/include +CPPFLAGS += -iquote $(LIBDIR)/arch/$(ARCH)/include ifeq ($(strip $(V)),) E = @echo @@ -48,12 +54,25 @@ endif RM := rm -f --one-file-system ifeq ($(COMPAT_TEST),y) + # Firstly look for 32-bit libs and then in standard path. + PKG_CONFIG_PATH := $(shell pkg-config --variable pc_path pkg-config) + PKG_CONFIG_PATH := /usr/lib32/pkgconfig:$(PKG_CONFIG_PATH) ifeq ($(ARCH),x86) export CFLAGS += -m32 export LDFLAGS += -m32 + PKG_CONFIG_PATH := /usr/lib/i386-linux-gnu/pkgconfig:$(PKG_CONFIG_PATH) endif + export PKG_CONFIG_PATH endif +define pkg-libs + $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" pkg-config --libs $(1)) +endef + +define pkg-cflags + $(shell PKG_CONFIG_PATH="$(PKG_CONFIG_PATH)" pkg-config --cflags $(1)) +endef + %.d: %.c $(E) " DEP " $@ $(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -MM -MP -c $< -o $@ diff --git a/test/zdtm/lib/Makefile b/test/zdtm/lib/Makefile index d2d9f1cc31..89ca909332 100644 --- a/test/zdtm/lib/Makefile +++ b/test/zdtm/lib/Makefile @@ -4,7 +4,7 @@ CFLAGS += $(USERCFLAGS) LIB := libzdtmtst.a -LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c fs.c +LIBSRC := datagen.c msg.c parseargs.c test.c streamutil.c lock.c ns.c tcp.c unix.c fs.c sysctl.c LIBOBJ := $(LIBSRC:%.c=%.o) BIN := groups diff --git a/test/zdtm/lib/fs.c b/test/zdtm/lib/fs.c index 0decfc37b7..e82011ec86 100644 --- a/test/zdtm/lib/fs.c +++ b/test/zdtm/lib/fs.c @@ -94,3 +94,27 @@ mnt_info_t *get_cwd_mnt_info(void) mnt_info_free(&m); goto out; } + +int get_cwd_check_perm(char **result) +{ + char *cwd; + *result = 0; + cwd = get_current_dir_name(); + if (!cwd) { + pr_perror("failed to get current directory"); + return -1; + } + + if (access(cwd, X_OK)) { + pr_err("access check for bit X for current dir path '%s' " + "failed for uid:%d,gid:%d, error: %d(%s). 
" + "Bit 'x' should be set in all path components of " + "this directory\n", + cwd, getuid(), getgid(), errno, strerror(errno) + ); + return -1; + } + + *result = cwd; + return 0; +} diff --git a/test/zdtm/lib/fs.h b/test/zdtm/lib/fs.h index 972b15abad..af7a665fb3 100644 --- a/test/zdtm/lib/fs.h +++ b/test/zdtm/lib/fs.h @@ -50,4 +50,28 @@ extern mnt_info_t *mnt_info_alloc(void); extern void mnt_info_free(mnt_info_t **m); extern mnt_info_t *get_cwd_mnt_info(void); +/* + * get_cwd_check_perm is called to check that cwd is actually usable for a calling + * process. + * + * Example output of a stat command on a '/root' path shows file access bits: + * > stat /root + * File: ‘/root’ + * ... + * Access: (0550/dr-xr-x---) Uid: ( 0/root) Gid: ( 0/root) + * ^- no 'x' bit for other + * + * Here we can see that '/root' dir (that often can be part of cwd path) does not + * allow non-root user and non-root group to list contents of this directory. + * Calling process matching 'other' access category may succeed getting cwd path, but will + * fail performing further filesystem operations based on this path with confusing errors. + * + * This function calls get_current_dir_name and explicitly checks that bit 'x' is enabled for + * a calling process and logs the error. 
+ * + * If check passes, stores get_current_dir_name's result in *result and returns 0 + * If check fails, stores 0 in *result and returns -1 + */ +extern int get_cwd_check_perm(char **result); + #endif /* ZDTM_FS_H_ */ diff --git a/test/zdtm/lib/parseargs.c b/test/zdtm/lib/parseargs.c index 7e411f6b6c..d8aa4ed639 100644 --- a/test/zdtm/lib/parseargs.c +++ b/test/zdtm/lib/parseargs.c @@ -113,8 +113,8 @@ static void helpexit(void) exit(1); } -const char *test_doc; -const char *test_author; +const char __attribute__((weak)) *test_doc; +const char __attribute__((weak)) *test_author; static void prdoc(void) { diff --git a/test/zdtm/lib/sysctl.c b/test/zdtm/lib/sysctl.c new file mode 100644 index 0000000000..9583ec3df5 --- /dev/null +++ b/test/zdtm/lib/sysctl.c @@ -0,0 +1,59 @@ +#include + +#include "zdtmtst.h" +#include "sysctl.h" + +int sysctl_read_int(const char *name, int *data) +{ + int fd; + int ret; + char buf[16]; + + fd = open(name, O_RDONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return fd; + } + + ret = read(fd, buf, sizeof(buf) - 1); + if (ret < 0) { + pr_perror("Can't read %s", name); + ret = -errno; + goto err; + } + + buf[ret] = '\0'; + + *data = (int)strtoul(buf, NULL, 10); + ret = 0; +err: + close(fd); + return ret; +} + +int sysctl_write_int(const char *name, int val) +{ + int fd; + int ret; + char buf[16]; + + fd = open(name, O_WRONLY); + if (fd < 0) { + pr_perror("Can't open %s", name); + return fd; + } + + sprintf(buf, "%d\n", val); + + ret = write(fd, buf, strlen(buf)); + if (ret < 0) { + pr_perror("Can't write %d into %s", val, name); + ret = -errno; + goto err; + } + + ret = 0; +err: + close(fd); + return ret; +} diff --git a/test/zdtm/lib/sysctl.h b/test/zdtm/lib/sysctl.h new file mode 100644 index 0000000000..67129102fe --- /dev/null +++ b/test/zdtm/lib/sysctl.h @@ -0,0 +1,7 @@ +#ifndef __ZDTM_SYSCTL__ +#define __ZDTM_SYSCTL__ + +extern int sysctl_read_int(const char *name, int *data); +extern int sysctl_write_int(const char *name, 
int val); + +#endif diff --git a/test/zdtm/lib/test.c b/test/zdtm/lib/test.c index a1bdfc1b4f..630476de0e 100644 --- a/test/zdtm/lib/test.c +++ b/test/zdtm/lib/test.c @@ -71,7 +71,7 @@ static void test_fini(void) unlinkat(cwd, pidfile, 0); } -static void setup_outfile() +static void setup_outfile(void) { if (!access(outfile, F_OK) || errno != ENOENT) { fprintf(stderr, "Output file %s appears to exist, aborting\n", @@ -93,7 +93,7 @@ static void setup_outfile() exit(1); } -static void redir_stdfds() +static void redir_stdfds(void) { int nullfd; @@ -346,7 +346,7 @@ void test_init(int argc, char **argv) srand48(time(NULL)); /* just in case we need it */ } -void test_daemon() +void test_daemon(void) { futex_set_and_wake(&test_shared_state->stage, TEST_RUNNING_STAGE); } diff --git a/test/zdtm/lib/unix.c b/test/zdtm/lib/unix.c new file mode 100644 index 0000000000..c36846cadd --- /dev/null +++ b/test/zdtm/lib/unix.c @@ -0,0 +1,19 @@ +#include +#include +#include "zdtmtst.h" +#include "fs.h" + +int unix_fill_sock_name(struct sockaddr_un *name, char *relFilename) +{ + char *cwd; + + if (get_cwd_check_perm(&cwd)) { + pr_err("failed to get current working directory with valid permissions.\n"); + return -1; + } + + name->sun_family = AF_LOCAL; + ssprintf(name->sun_path, "%s/%s", cwd, relFilename); + return 0; +} + diff --git a/test/zdtm/lib/zdtmtst.h b/test/zdtm/lib/zdtmtst.h index 1fbf795bf8..bf9e21bf40 100644 --- a/test/zdtm/lib/zdtmtst.h +++ b/test/zdtm/lib/zdtmtst.h @@ -149,12 +149,18 @@ extern int tcp_init_server(int family, int *port); extern int tcp_accept_server(int sock); extern int tcp_init_client(int family, char *servIP, unsigned short servPort); +struct sockaddr_un; +extern int unix_fill_sock_name(struct sockaddr_un *name, char *relFilename); + struct zdtm_tcp_opts { bool reuseaddr; bool reuseport; int flags; }; +extern const char *test_author; +extern const char *test_doc; + extern int tcp_init_server_with_opts(int family, int *port, struct zdtm_tcp_opts *opts); 
extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *parent_tid, void *child_tid, unsigned long newtls); @@ -168,4 +174,9 @@ extern pid_t sys_clone_unified(unsigned long flags, void *child_stack, void *par ___ret; \ }) +#define sstrncpy(d, s) ({ \ + strncpy(d, s, sizeof(d)-1); \ + d[sizeof(d)-1] = '\0'; \ +}) + #endif /* _VIMITESU_H_ */ diff --git a/test/zdtm/static/Makefile b/test/zdtm/static/Makefile index d8279d6f84..e3ee397d7b 100644 --- a/test/zdtm/static/Makefile +++ b/test/zdtm/static/Makefile @@ -105,7 +105,8 @@ TST_NOFILE := \ socket-tcp-unconn \ socket-tcp6-unconn \ socket-tcp-syn-sent \ - socket-tcp-skip-in-flight \ + socket-tcp-skip-in-flight \ + socket-tcp-keepalive \ sock_opts00 \ sock_opts01 \ sk-unix-unconn \ @@ -121,6 +122,7 @@ TST_NOFILE := \ groups \ pdeath_sig \ file_fown \ + file_cloexec \ proc-self \ eventfs00 \ epoll \ @@ -207,6 +209,7 @@ TST_NOFILE := \ pipe03 \ netns_sub \ netns_sub_veth \ + netns_sub_sysctl \ unlink_multiple_largefiles \ config_inotify_irmap \ thp_disable \ @@ -217,15 +220,21 @@ TST_NOFILE := \ child_subreaper \ child_subreaper_existing_child \ child_subreaper_and_reparent \ + memfd00 \ + memfd01 \ + memfd02 \ + memfd03 \ + shmemfd \ + shmemfd-priv \ # jobctl00 \ -ifneq ($(SRCARCH),arm) +ifneq ($(ARCH),arm) ifneq ($(COMPAT_TEST),y) TST_NOFILE += maps03 endif endif -ifeq ($(SRCARCH),s390) +ifeq ($(ARCH),s390) TST_NOFILE += s390x_regs_check \ s390x_gs_threads \ s390x_runtime_instr @@ -286,6 +295,7 @@ TST_FILE = \ file_locks07 \ file_locks08 \ netns-nf \ + netns-nft \ maps_file_prot \ socket_close_data01 \ @@ -319,7 +329,8 @@ TST_DIR = \ cgroup03 \ cgroup04 \ cgroup_ifpriomap \ - cgroup_stray \ + cgroup_stray \ + cgroup_yard \ unlink_fstat04 \ unlink_fstat041 \ mntns_remap \ @@ -348,12 +359,16 @@ TST_DIR = \ del_standalone_un \ sk-unix-mntns \ sk-unix01 \ + bind-mount-unix \ unsupported_children_collision \ shared_slave_mount_children \ non_uniform_share_propagation \ private_bind_propagation \ 
ghost_on_rofs \ overmounted_file \ + opath_file \ + symlink \ + symlink01 \ TST_DIR_FILE = \ chroot \ @@ -527,8 +542,9 @@ stopped12: CFLAGS += -DZDTM_STOPPED_KILL -DZDTM_STOPPED_TKILL clone_fs: LDLIBS += -pthread # As generating dependencies won't work without proper includes, # we have to explicitly specify both .o and .d for this case: -netns_sub_veth.o netns_sub_veth.d: CPPFLAGS += -I/usr/include/libnl3 -netns_sub_veth: LDLIBS += -lnl-3 -l nl-route-3 +netns_sub_veth.o netns_sub_veth.d: CPPFLAGS += $(call pkg-cflags, libnl-3.0) +netns_sub_veth: LDLIBS += $(call pkg-libs, libnl-route-3.0 libnl-3.0) +symlink01: CFLAGS += -DZDTM_UNLINK_SYMLINK socket-tcp-fin-wait1: CFLAGS += -D ZDTM_TCP_FIN_WAIT1 socket-tcp-fin-wait2: CFLAGS += -D ZDTM_TCP_FIN_WAIT2 diff --git a/test/zdtm/static/apparmor.c b/test/zdtm/static/apparmor.c index 15930c7618..b3a4d75498 100644 --- a/test/zdtm/static/apparmor.c +++ b/test/zdtm/static/apparmor.c @@ -15,7 +15,7 @@ const char *test_author = "Tycho Andersen "; #define PROFILE "criu_test" -int setprofile() +int setprofile(void) { char profile[1024]; int fd, len; @@ -45,7 +45,7 @@ int setprofile() return 0; } -int checkprofile() +int checkprofile(void) { FILE *f; char path[PATH_MAX], profile[1024]; diff --git a/test/zdtm/static/arm-neon00.c b/test/zdtm/static/arm-neon00.c index 96da16c6b0..ce8123e515 100644 --- a/test/zdtm/static/arm-neon00.c +++ b/test/zdtm/static/arm-neon00.c @@ -12,13 +12,14 @@ const char *test_author = "Alexander Karatshov "; int main(int argc, char ** argv) { + int a, b, c, y1, y2; + srand(time(0)); - int a = rand() % 100; - int b = rand() % 100; - int c = rand() % 100; - int y1 = a + b*c; - int y2; + a = rand() % 100; + b = rand() % 100; + c = rand() % 100; + y1 = a + b*c; test_init(argc, argv); diff --git a/test/zdtm/static/bind-mount-unix.c b/test/zdtm/static/bind-mount-unix.c new file mode 100644 index 0000000000..7f649ed70d --- /dev/null +++ b/test/zdtm/static/bind-mount-unix.c @@ -0,0 +1,177 @@ +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check bind-mounts with unix socket"; +const char *test_author = "Cyrill Gorcunov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +int main(int argc, char **argv) +{ + char path_unix[PATH_MAX], path_bind[PATH_MAX]; + char unix_name[] = "criu-log"; + char bind_name[] = "criu-bind-log"; + int sk = -1, skc = -1, ret = 1, fd; + struct sockaddr_un addr; + unsigned int addrlen; + task_waiter_t t; + struct stat st; + int status; + pid_t pid; + + char buf[] = "123456"; + char rbuf[sizeof(buf)]; + + test_init(argc, argv); + task_waiter_init(&t); + + mkdir(dirname, 0700); + if (mount("none", dirname, "tmpfs", 0, NULL)) { + pr_perror("Unable to mount %s", dirname); + return 1; + } + + ssprintf(path_bind, "%s/%s", dirname, bind_name); + ssprintf(path_unix, "%s/%s", dirname, unix_name); + + unlink(path_bind); + unlink(path_unix); + + fd = open(path_bind, O_RDONLY | O_CREAT); + if (fd < 0) { + pr_perror("Can't open %s", path_bind); + return 1; + } + close(fd); + + addr.sun_family = AF_UNIX; + sstrncpy(addr.sun_path, path_unix); + addrlen = sizeof(addr.sun_family) + strlen(path_unix); + + sk = socket(AF_UNIX, SOCK_DGRAM, 0); + if (sk < 0) { + pr_perror("Can't create socket %s", path_unix); + return 1; + } + + ret = bind(sk, (struct sockaddr *)&addr, addrlen); + if (ret) { + pr_perror("Can't bind socket %s", path_unix); + return 1; + } + + if (stat(path_unix, &st) == 0) { + test_msg("path %s st.st_ino %#lx st.st_mode 0%o (sock %d)\n", + path_unix, (unsigned long)st.st_ino, + (int)st.st_mode, !!S_ISSOCK(st.st_mode)); + } else { + pr_perror("Can't stat on %s", path_unix); + return 1; + } + + if (mount(path_unix, path_bind, NULL, MS_BIND | MS_REC, NULL)) { + pr_perror("Unable to bindmount %s -> %s", path_unix, path_bind); + return 1; + } + + if (stat(path_unix, &st) == 0) { + test_msg("path %s st.st_dev %#x st.st_rdev 
%#x st.st_ino %#lx st.st_mode 0%o (sock %d)\n", + path_unix, (int)st.st_dev, (int)st.st_rdev, (unsigned long)st.st_ino, + (int)st.st_mode, !!S_ISSOCK(st.st_mode)); + } else { + pr_perror("Can't stat on %s", path_unix); + return 1; + } + + if (stat(path_bind, &st) == 0) { + test_msg("path %s st.st_dev %#x st.st_rdev %#x st.st_ino %#lx st.st_mode 0%o (sock %d)\n", + path_bind, (int)st.st_dev, (int)st.st_rdev, (unsigned long)st.st_ino, + (int)st.st_mode, !!S_ISSOCK(st.st_mode)); + } else { + pr_perror("Can't stat on %s", path_bind); + return 1; + } + + pid = test_fork(); + if (pid < 0) { + pr_perror("Can't fork"); + return 1; + } else if (pid == 0) { + skc = socket(AF_UNIX, SOCK_DGRAM, 0); + if (skc < 0) { + pr_perror("Can't create client socket"); + _exit(1); + } + + addr.sun_family = AF_UNIX; + sstrncpy(addr.sun_path, path_bind); + addrlen = sizeof(addr.sun_family) + strlen(path_bind); + + ret = connect(skc, (struct sockaddr *)&addr, addrlen); + if (ret) { + pr_perror("Can't connect\n"); + _exit(1); + } else + test_msg("Connected to %s", addr.sun_path); + + task_waiter_complete(&t, 1); + task_waiter_wait4(&t, 2); + + ret = sendto(skc, buf, sizeof(buf), 0, (struct sockaddr *)&addr, addrlen); + if (ret != (int)sizeof(buf)) { + pr_perror("Can't send data on client"); + _exit(1); + } + + close(skc); + _exit(0); + } + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + task_waiter_complete(&t, 2); + + ret = read(sk, rbuf, sizeof(rbuf)); + if (ret < 0) { + fail("Can't read data"); + goto err; + } + + if (ret != sizeof(buf) || memcmp(buf, rbuf, sizeof(buf))) { + fail("Data mismatch"); + goto err; + } + + ret = wait(&status); + if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status)) { + kill(pid, SIGKILL); + fail("Unable to wait child"); + } else { + ret = 0; + pass(); + } + +err: + umount2(path_bind, MNT_DETACH); + umount2(dirname, MNT_DETACH); + unlink(path_bind); + unlink(path_unix); + close(sk); + + return ret ? 
1 : 0; +} diff --git a/test/zdtm/static/bind-mount-unix.desc b/test/zdtm/static/bind-mount-unix.desc new file mode 100644 index 0000000000..a8849e0970 --- /dev/null +++ b/test/zdtm/static/bind-mount-unix.desc @@ -0,0 +1 @@ +{'flavor': 'ns uns', 'flags': 'suid', 'feature': 'mnt_id'} diff --git a/test/zdtm/static/cgroup_yard.c b/test/zdtm/static/cgroup_yard.c new file mode 120000 index 0000000000..f3683c2b43 --- /dev/null +++ b/test/zdtm/static/cgroup_yard.c @@ -0,0 +1 @@ +cgroup00.c \ No newline at end of file diff --git a/test/zdtm/static/cgroup_yard.desc b/test/zdtm/static/cgroup_yard.desc new file mode 100644 index 0000000000..8736d6780d --- /dev/null +++ b/test/zdtm/static/cgroup_yard.desc @@ -0,0 +1,7 @@ +{ +'flavor': 'h', +'flags': 'suid', +# We create the external cgroup yard in the working directory during the +# --post-start hook. We have to go up a few directories to find the yard. +'opts': '--manage-cgroups --cgroup-yard ../../../../../../external_yard' +} diff --git a/test/zdtm/static/cgroup_yard.hook b/test/zdtm/static/cgroup_yard.hook new file mode 100755 index 0000000000..072b9d38d7 --- /dev/null +++ b/test/zdtm/static/cgroup_yard.hook @@ -0,0 +1,51 @@ +#!/usr/bin/env python + +import sys +import os +import subprocess +import tempfile + +yard = "external_yard" + +if sys.argv[1] == "--post-start": + ''' + Create external cgroup yard to be passed to CRIU via --cgroup-yard + ''' + os.mkdir(yard) + subprocess.check_call(["mount", "-t", "tmpfs", "zdtm_yard", yard]) + with open("/proc/self/cgroup") as f: + for line in f: + cgr = line.split(":")[1] + + if cgr == "": + continue + + if cgr.startswith("name="): + ctrl = cgr[len("name="):] + opts = "none," + cgr + else: + ctrl = cgr + opts = cgr + + os.mkdir(yard + "/" + ctrl) + subprocess.check_call(["mount", "-t", "cgroup", "none", yard + "/" + ctrl, "-o", opts]) + +if sys.argv[1] in ["--pre-restore", "--clean"]: + ''' + Clean up the leftover cgroups created by the test + ''' + tname = tempfile.mkdtemp() + 
subprocess.call(["mount", "-t", "cgroup", "none", tname, "-o", "none,name=zdtmtst"]) + + for cg in [os.path.join(tname, "subcg00", "subsubcg"), + os.path.join(tname, "subcg00")]: + if os.access(cg, os.F_OK): + os.rmdir(cg) + + subprocess.call(["umount", tname]) + os.rmdir(tname) + +if sys.argv[1] == "--clean": + if os.access(yard, os.F_OK): + subprocess.call(["umount", "-l", yard]) + os.rmdir(yard) diff --git a/test/zdtm/static/child_subreaper.c b/test/zdtm/static/child_subreaper.c index 267795249b..6d02c9f933 100644 --- a/test/zdtm/static/child_subreaper.c +++ b/test/zdtm/static/child_subreaper.c @@ -8,10 +8,11 @@ const char *test_author = "Michał Cłapiński "; int main(int argc, char **argv) { + int cs_before = 1, cs_after, ret; + test_init(argc, argv); - int cs_before = 1; - int ret = prctl(PR_SET_CHILD_SUBREAPER, cs_before, 0, 0, 0); + ret = prctl(PR_SET_CHILD_SUBREAPER, cs_before, 0, 0, 0); if (ret) { pr_perror("Can't set child subreaper attribute, err = %d", ret); exit(1); @@ -20,7 +21,6 @@ int main(int argc, char **argv) test_daemon(); test_waitsig(); - int cs_after; ret = prctl(PR_GET_CHILD_SUBREAPER, (unsigned long)&cs_after, 0, 0, 0); if (ret) { pr_perror("Can't get child subreaper attribute, err = %d", ret); diff --git a/test/zdtm/static/child_subreaper_and_reparent.c b/test/zdtm/static/child_subreaper_and_reparent.c index 57943a67b7..e3955d3d94 100644 --- a/test/zdtm/static/child_subreaper_and_reparent.c +++ b/test/zdtm/static/child_subreaper_and_reparent.c @@ -25,7 +25,7 @@ struct shared { int parent_after_cr; } *sh; -int orphan() +int orphan(void) { /* * Wait until reparented to the pidns init. 
(By waiting @@ -45,7 +45,7 @@ int orphan() return 0; } -int helper() +int helper(void) { int pid; @@ -59,7 +59,7 @@ int helper() return 0; } -int subreaper() +int subreaper(void) { int pid, ret, status; diff --git a/test/zdtm/static/child_subreaper_existing_child.c b/test/zdtm/static/child_subreaper_existing_child.c index 28e9dbb8ae..8291aba087 100644 --- a/test/zdtm/static/child_subreaper_existing_child.c +++ b/test/zdtm/static/child_subreaper_existing_child.c @@ -24,7 +24,7 @@ struct shared { } *sh; -int orphan() +int orphan(void) { /* Return the control back to MAIN worker to do C/R */ futex_set_and_wake(&sh->fstate, TEST_CRIU); @@ -36,7 +36,7 @@ int orphan() return 0; } -int helper() +int helper(void) { int pid; @@ -52,7 +52,7 @@ int helper() return 0; } -int subreaper() +int subreaper(void) { int pid, ret, status; diff --git a/test/zdtm/static/config_inotify_irmap.c b/test/zdtm/static/config_inotify_irmap.c index 831dc19741..3cbeba7d38 100644 --- a/test/zdtm/static/config_inotify_irmap.c +++ b/test/zdtm/static/config_inotify_irmap.c @@ -31,6 +31,7 @@ char test_files[2][128] = {TDIR"/zdtm-test", TDIR"/zdtm-test1",}; int main (int argc, char *argv[]) { + FILE *configfile; char buf[BUFF_SIZE]; int fd, wd, i; @@ -56,7 +57,7 @@ int main (int argc, char *argv[]) } } - FILE *configfile = fopen(CONFIG_PATH, "w"); + configfile = fopen(CONFIG_PATH, "w"); if (configfile == NULL) { pr_perror("Unable to create configuration file %s", CONFIG_PATH); goto err; diff --git a/test/zdtm/static/conntracks b/test/zdtm/static/conntracks index a30e0e2685..26220f97c0 100755 --- a/test/zdtm/static/conntracks +++ b/test/zdtm/static/conntracks @@ -23,7 +23,7 @@ do_or_fail() fail "$failmsg: $output" } -do_start() +do_start_ipt() { [ -f "$statefile" ] && die "state file $statefile aleady exists" @@ -35,7 +35,7 @@ do_start() iptables -L \> "$statefile" } -do_stop() +do_stop_ipt() { do_or_fail "can't compare the iptables" \ iptables -L \| diff -u "$statefile" - @@ -45,6 +45,38 @@ do_stop() 
echo "PASS" > $outfile } +do_start_nft() +{ + [ -f "$statefile" ] && die "state file $statefile aleady exists" + + do_or_fail "can't install a state match" \ + nft add rule filter INPUT \ + ct state related,established accept + + do_or_fail "can't list the loaded nftables" \ + nft list ruleset \> "$statefile" +} + +do_stop_nft() +{ + do_or_fail "can't compare the nftables" \ + nft list ruleset \| diff -u "$statefile" - + + rm -f "$statefile" + + echo "PASS" > $outfile +} + +do_start() +{ + [ -x "$(command -v nft)" ] && do_start_nft || do_start_ipt +} + +do_stop() +{ + [ -x "$(command -v nft)" ] && do_stop_nft || do_stop_ipt +} + tmpargs="$(../lib/parseargs.sh --name=$0 \ --flags-req=statefile,outfile \ --flags-opt="start,stop" -- "$@")" || diff --git a/test/zdtm/static/del_standalone_un.c b/test/zdtm/static/del_standalone_un.c index d8200068be..5426fc7865 100644 --- a/test/zdtm/static/del_standalone_un.c +++ b/test/zdtm/static/del_standalone_un.c @@ -16,19 +16,6 @@ const char *test_author = "Tycho Andersen "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); -static int fill_sock_name(struct sockaddr_un *name, const char *filename) -{ - char *cwd; - - cwd = get_current_dir_name(); - if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) - return -1; - - name->sun_family = AF_LOCAL; - ssprintf(name->sun_path, "%s/%s", cwd, filename); - return 0; -} - static int bind_and_listen(struct sockaddr_un *addr) { int sk; @@ -71,10 +58,8 @@ int main(int argc, char **argv) goto out; } - if (fill_sock_name(&addr, filename) < 0) { - pr_err("filename \"%s\" is too long\n", filename); + if (unix_fill_sock_name(&addr, filename)) goto out; - } sk1 = bind_and_listen(&addr); if (sk1 < 0) diff --git a/test/zdtm/static/deleted_unix_sock.c b/test/zdtm/static/deleted_unix_sock.c index bcc33f3dec..4d328e9966 100644 --- a/test/zdtm/static/deleted_unix_sock.c +++ b/test/zdtm/static/deleted_unix_sock.c @@ -17,28 +17,13 @@ const char *test_author = "Roman Kagan "; 
char *filename; TEST_OPTION(filename, string, "file name", 1); -static int fill_sock_name(struct sockaddr_un *name, const char *filename) -{ - char *cwd; - - cwd = get_current_dir_name(); - if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) - return -1; - - name->sun_family = AF_LOCAL; - sprintf(name->sun_path, "%s/%s", cwd, filename); - return 0; -} - static int setup_srv_sock(void) { struct sockaddr_un name; int sock; - if (fill_sock_name(&name, filename) < 0) { - pr_perror("filename \"%s\" is too long", filename); + if (unix_fill_sock_name(&name, filename)) return -1; - } sock = socket(PF_LOCAL, SOCK_STREAM, 0); if (sock < 0) { @@ -67,7 +52,7 @@ static int setup_clnt_sock(void) struct sockaddr_un name; int sock; - if (fill_sock_name(&name, filename) < 0) + if (unix_fill_sock_name(&name, filename)) return -1; sock = socket(PF_LOCAL, SOCK_STREAM, 0); diff --git a/test/zdtm/static/dumpable02.c b/test/zdtm/static/dumpable02.c index 024371bd88..7e2eee2d1e 100644 --- a/test/zdtm/static/dumpable02.c +++ b/test/zdtm/static/dumpable02.c @@ -13,7 +13,7 @@ const char *test_doc = "Check dumpable flag handling (non-dumpable case)"; const char *test_author = "Filipe Brandenburger "; -int dumpable_server() { +int dumpable_server(void) { char buf[256]; int ret; diff --git a/test/zdtm/static/fdt_shared.c b/test/zdtm/static/fdt_shared.c index 2111356f53..a84444af5e 100644 --- a/test/zdtm/static/fdt_shared.c +++ b/test/zdtm/static/fdt_shared.c @@ -22,7 +22,7 @@ TEST_OPTION(filename, string, "file name", 1); #define CHILDREN 4 static int fork_pfd[2]; -static void forked() +static void forked(void) { char c = 0; @@ -32,7 +32,7 @@ static void forked() } } -static void wait_children() +static void wait_children(void) { int i; char c; diff --git a/test/zdtm/static/file_cloexec.c b/test/zdtm/static/file_cloexec.c new file mode 100644 index 0000000000..b8eba39e54 --- /dev/null +++ b/test/zdtm/static/file_cloexec.c @@ -0,0 +1,63 @@ +#include +#include +#include +#include 
+#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check FD_CLOEXEC flag"; +const char *test_author = "Nicolas Viennot "; + +#define err(exitcode, msg, ...) ({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +static void assert_fd_flags(int fd, int mask, int value) +{ + int flags = fcntl(fd, F_GETFD); + if (flags == -1) + err(1, "Can't get fd flags"); + + if ((flags & mask) != value) { + fail("fd flags mismatch"); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + int fd1, fd2, fd3, fd4; + + test_init(argc, argv); + + fd1 = open("/", O_RDONLY | O_CLOEXEC); + if (fd1 < 0) + err(1, "Can't open()"); + + fd2 = open("/", O_RDONLY); + if (fd2 < 0) + err(1, "Can't open()"); + + fd3 = dup(fd1); + if (fd3 < 0) + err(1, "Can't dup()"); + + fd4 = fcntl(fd2, F_DUPFD_CLOEXEC, 0); + if (fd4 < 0) + err(1, "Can't dup()"); + + test_daemon(); + test_waitsig(); + + assert_fd_flags(fd1, FD_CLOEXEC, FD_CLOEXEC); + assert_fd_flags(fd2, FD_CLOEXEC, 0); + assert_fd_flags(fd3, FD_CLOEXEC, 0); + assert_fd_flags(fd4, FD_CLOEXEC, FD_CLOEXEC); + + pass(); + + return 0; +} diff --git a/test/zdtm/static/file_locks00.c b/test/zdtm/static/file_locks00.c index 59e19cfe1d..fa98a31b3d 100644 --- a/test/zdtm/static/file_locks00.c +++ b/test/zdtm/static/file_locks00.c @@ -101,7 +101,7 @@ static int check_write_lock(int fd, int whence, off_t offset, off_t len) return -1; } -static int check_file_locks() +static int check_file_locks(void) { int fd_0, fd_1; int ret0, ret1; diff --git a/test/zdtm/static/inotify00.c b/test/zdtm/static/inotify00.c index 67088edd8f..635c050471 100644 --- a/test/zdtm/static/inotify00.c +++ b/test/zdtm/static/inotify00.c @@ -125,9 +125,10 @@ int main (int argc, char *argv[]) { pid_t pid; task_waiter_t t; - task_waiter_init(&t); static char buf[PATH_MAX]; + task_waiter_init(&t); + if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) { pr_perror("Unable to remount /"); return 1; diff --git 
a/test/zdtm/static/inotify_system.c b/test/zdtm/static/inotify_system.c index 59f47c41c8..3e6b2ad48b 100644 --- a/test/zdtm/static/inotify_system.c +++ b/test/zdtm/static/inotify_system.c @@ -68,7 +68,7 @@ typedef struct { int dir; } desc; -void do_wait() { +void do_wait(void) { test_daemon(); test_waitsig(); } diff --git a/test/zdtm/static/maps00.c b/test/zdtm/static/maps00.c index a6c68cd25b..f2da9b9756 100644 --- a/test/zdtm/static/maps00.c +++ b/test/zdtm/static/maps00.c @@ -123,7 +123,7 @@ static void segfault(int signo) * after test func should be placed check map, because size of test_func * is calculated as (check_map-test_func) */ -int test_func() +int test_func(void) { return 1; } @@ -176,8 +176,9 @@ static int check_map(struct map *map) memcpy(map->ptr,test_func, getpagesize()); } else { if (!(map->flag & MAP_ANONYMOUS)) { + uint8_t funlen = (uint8_t *)check_map - (uint8_t *)test_func; lseek(map->fd,0,SEEK_SET); - if (write(map->fd,test_func,check_map - test_func)<check_map - test_func) { + if (write(map->fd,test_func,funlen)<funlen) { pr_perror("failed to write %s", map->filename); return -1; } @@ -185,7 +186,7 @@ static int check_map(struct map *map) } if (!(map->flag & MAP_ANONYMOUS) || map->prot & PROT_WRITE) /* Function body has been copied into the mapping */ - ((int (*)())map->ptr)(); /* perform exec access */ + ((int (*)(void))map->ptr)(); /* perform exec access */ else /* No way to copy function body into mapping, * clear exec bit from effective protection diff --git a/test/zdtm/static/maps03.c b/test/zdtm/static/maps03.c index f2bf7957a3..0e0a5b8f25 100644 --- a/test/zdtm/static/maps03.c +++ b/test/zdtm/static/maps03.c @@ -16,9 +16,10 @@ const char *test_author = "Cyrill Gorcunov "; int main(int argc, char **argv) { - test_init(argc, argv); unsigned char *mem; + test_init(argc, argv); + test_msg("Alloc huge VMA\n"); mem = (void *)mmap(NULL, (10L << 30), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); diff --git a/test/zdtm/static/memfd00.c b/test/zdtm/static/memfd00.c new file mode 100644 index 0000000000..6b56eca011 ---
/dev/null +++ b/test/zdtm/static/memfd00.c @@ -0,0 +1,103 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "memfd file descriptor"; +const char *test_author = "Nicolas Viennot "; + +#define err(exitcode, msg, ...) ({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +int main(int argc, char *argv[]) +{ + int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; + struct statfs statfs1, statfs2; + off_t pos1, pos2; + char buf[5]; + + test_init(argc, argv); + + fd = _memfd_create("somename", MFD_CLOEXEC); + if (fd < 0) + err(1, "Can't call memfd_create"); + + if (fcntl(fd, F_SETFL, O_APPEND) < 0) + err(1, "Can't set fl flags"); + + if ((fl_flags1 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if ((fd_flags1 = fcntl(fd, F_GETFD)) == -1) + err(1, "Can't get fd flags"); + + if (fstatfs(fd, &statfs1) < 0) + err(1, "statfs issue"); + + if (write(fd, "hello", 5) != 5) + err(1, "write error"); + + pos1 = 3; + if (lseek(fd, pos1, SEEK_SET) < 0) + err(1, "seek error"); + + test_daemon(); + test_waitsig(); + + if ((fl_flags2 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if (fl_flags1 != fl_flags2) { + fail("fl flags differs"); + return 1; + } + + if ((fd_flags2 = fcntl(fd, F_GETFD)) == -1) + err(1, "Can't get fd flags"); + + if (fd_flags1 != fd_flags2) { + fail("fd flags differs"); + return 1; + } + + if (fstatfs(fd, &statfs2) < 0) + err(1, "statfs issue"); + + if (statfs1.f_type != statfs2.f_type) { + fail("statfs.f_type differs"); + return 1; + } + + pos2 = lseek(fd, 0, SEEK_CUR); + if (pos1 != pos2) { + fail("position differs"); + return 1; + } + + if (pread(fd, buf, sizeof(buf), 0) != sizeof(buf)) { + fail("read problem"); + return 1; + } + + if (memcmp(buf, "hello", sizeof(buf))) { + fail("content mismatch");
+ return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd01.c b/test/zdtm/static/memfd01.c new file mode 100644 index 0000000000..7a78536422 --- /dev/null +++ b/test/zdtm/static/memfd01.c @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "memfd with different file pointer"; +const char *test_author = "Nicolas Viennot "; + +#define err(exitcode, msg, ...) ({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +int main(int argc, char *argv[]) +{ + pid_t pid, pid_child; + int fd, ret, status; + task_waiter_t t; + + test_init(argc, argv); + + task_waiter_init(&t); + + fd = _memfd_create("somename", MFD_CLOEXEC); + if (fd < 0) + err(1, "Can't call memfd_create"); + + pid = getpid(); + + pid_child = fork(); + if (pid_child < 0) + err(1, "Can't fork"); + + if (!pid_child) { + char fdpath[100]; + char buf[1]; + int fl_flags1, fl_flags2, fd_flags1, fd_flags2; + + snprintf(fdpath, sizeof(fdpath), "/proc/%d/fd/%d", pid, fd); + /* + * We pass O_LARGEFILE because in compat mode, our file + * descriptor does not get O_LARGEFILE automatically, but the + * restorer using non-compat open() is forced O_LARGEFILE. + * This creates a flag difference, which we don't want to deal + * with at the moment.
+ */ + fd = open(fdpath, O_RDONLY | O_LARGEFILE); + if (fd < 0) + err(1, "Can't open memfd via proc"); + + if ((fl_flags1 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if ((fd_flags1 = fcntl(fd, F_GETFD)) == -1) + err(1, "Can't get fd flags"); + + task_waiter_complete(&t, 1); + // checkpoint-restore happens here + task_waiter_wait4(&t, 2); + + if (read(fd, buf, 1) != 1) + err(1, "Can't read"); + + if ((fl_flags2 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if (fl_flags1 != fl_flags2) + err(1, "fl flags differs"); + + if ((fd_flags2 = fcntl(fd, F_GETFD)) == -1) + err(1, "Can't get fd flags"); + + if (fd_flags1 != fd_flags2) + err(1, "fd flags differs"); + + if (buf[0] != 'x') + err(1, "Read incorrect"); + + return 0; + } + + task_waiter_wait4(&t, 1); + + test_daemon(); + test_waitsig(); + + if (write(fd, "x", 1) != 1) + err(1, "Can't write"); + + task_waiter_complete(&t, 2); + + ret = wait(&status); + if (ret == -1 || !WIFEXITED(status) || WEXITSTATUS(status)) { + kill(pid, SIGKILL); + fail("child had issue"); + return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd02.c b/test/zdtm/static/memfd02.c new file mode 100644 index 0000000000..1843e9c9af --- /dev/null +++ b/test/zdtm/static/memfd02.c @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "memfd mmap"; +const char *test_author = "Nicolas Viennot "; + +#define err(exitcode, msg, ...) 
({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + +int main(int argc, char *argv[]) +{ +#define LEN 6 + int fd; + void *addr_shared, *addr_private; + char buf[LEN]; + + test_init(argc, argv); + + fd = _memfd_create("somename", MFD_CLOEXEC); + if (fd < 0) + err(1, "Can't call memfd_create"); + + if (ftruncate(fd, LEN) < 0) + err(1, "Can't truncate"); + + addr_shared = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr_shared == MAP_FAILED) + err(1, "Can't mmap"); + + write(fd, "write1", LEN); + + addr_private = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (addr_private == MAP_FAILED) + err(1, "Can't mmap"); + + test_daemon(); + test_waitsig(); + + if (memcmp(addr_shared, "write1", LEN)) { + fail("content mismatch (shared)"); + return 1; + } + + strcpy(addr_shared, "write2"); + + if (pread(fd, buf, LEN, 0) != LEN) { + fail("read problem"); + return 1; + } + + if (memcmp(buf, "write2", LEN)) { + fail("content mismatch (shared)"); + return 1; + } + + if (memcmp(addr_private, "write2", LEN)) { + fail("content mismatch (private)"); + return 1; + } + + strcpy(addr_private, "write3"); + + if (memcmp(addr_shared, "write2", LEN)) { + fail("content mismatch (shared)"); + return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/memfd03.c b/test/zdtm/static/memfd03.c new file mode 100644 index 0000000000..faedf9383e --- /dev/null +++ b/test/zdtm/static/memfd03.c @@ -0,0 +1,97 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "memfd seals"; +const char *test_author = "Nicolas Viennot "; + +#define err(exitcode, msg, ...) 
({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +static int _memfd_create(const char *name, unsigned int flags) +{ + return syscall(SYS_memfd_create, name, flags); +} + + +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif + +#ifndef F_ADD_SEALS + #define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +#endif + +#ifndef F_GET_SEALS + #define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif + + +#ifndef F_SEAL_SEAL +#define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +#define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +#define F_SEAL_GROW 0x0004 /* prevent file from growing */ +#define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +int main(int argc, char *argv[]) +{ +#define LEN 5 + int fd, fd2; + void *addr_write, *addr_read; + char fdpath[100]; + + test_init(argc, argv); + + fd = _memfd_create("somename", MFD_ALLOW_SEALING | MFD_CLOEXEC); + if (fd < 0) + err(1, "Can't call memfd_create"); + + if (write(fd, "hello", LEN) != LEN) + err(1, "Can't write"); + + if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE) < 0) + err(1, "Can't add seals"); + + test_daemon(); + test_waitsig(); + + snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", fd); + fd2 = open(fdpath, O_RDWR); + if (fd2 < 0) + err(1, "Can't open memfd via proc"); + + if (fcntl(fd, F_GET_SEALS) != F_SEAL_WRITE) { + fail("Seals are different"); + return 1; + } + + addr_write = mmap(NULL, LEN, PROT_WRITE, MAP_SHARED, fd2, 0); + if (addr_write != MAP_FAILED) { + fail("Should not be able to get write access"); + return 1; + } + + addr_read = mmap(NULL, 1, PROT_READ, MAP_PRIVATE, fd2, 0); + if (addr_read == MAP_FAILED) + err(1, "Can't mmap"); + + if (memcmp(addr_read, "hello", LEN)) { + fail("Mapping has bad data"); + return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/mnt_ext_dev.c b/test/zdtm/static/mnt_ext_dev.c index a9ac01333b..1d60fc92fe 100644 --- a/test/zdtm/static/mnt_ext_dev.c +++ b/test/zdtm/static/mnt_ext_dev.c @@ -20,10 +20,11 @@ 
TEST_OPTION(dirname, string, "directory name", 1); int main(int argc, char **argv) { char *loop, fd, dfd, fd2; - test_init(argc, argv); struct stat st, stp, st2; char dname[PATH_MAX], dname2[PATH_MAX]; + test_init(argc, argv); + snprintf(dname, sizeof(dname), "%s/test_dir", dirname); snprintf(dname2, sizeof(dname2), "%s/test_dir2", dirname); diff --git a/test/zdtm/static/mntns_link_remap.c b/test/zdtm/static/mntns_link_remap.c index 642641b161..6ac08191ab 100644 --- a/test/zdtm/static/mntns_link_remap.c +++ b/test/zdtm/static/mntns_link_remap.c @@ -230,8 +230,8 @@ int main(int argc, char **argv) if (pid > 0) { - kill(pid, SIGTERM); int status = 1; + kill(pid, SIGTERM); wait(&status); if (WIFEXITED(status)) { if (WEXITSTATUS(status) == AWK_OK) diff --git a/test/zdtm/static/mntns_open.c b/test/zdtm/static/mntns_open.c index e19c4ea72d..c687080a78 100644 --- a/test/zdtm/static/mntns_open.c +++ b/test/zdtm/static/mntns_open.c @@ -119,8 +119,8 @@ int main(int argc, char **argv) test_waitsig(); if (pid > 0) { - kill(pid, SIGTERM); int status = 1; + kill(pid, SIGTERM); wait(&status); if (WIFEXITED(status)) { if (WEXITSTATUS(status) == AWK_OK) diff --git a/test/zdtm/static/mntns_rw_ro_rw.c b/test/zdtm/static/mntns_rw_ro_rw.c index 7aed254b69..6179c47882 100644 --- a/test/zdtm/static/mntns_rw_ro_rw.c +++ b/test/zdtm/static/mntns_rw_ro_rw.c @@ -31,12 +31,12 @@ int main(int argc, char **argv) test_waitsig(); if (access("/proc/sys/net/ipv4/ip_forward", W_OK)) { - fail("Unable to access /proc/sys/net/core/wmem_max"); + fail("Unable to access /proc/sys/net/ipv4/ip_forward"); return 1; } if (access("/proc/sys/kernel/ns_last_pid", W_OK) != -1 || errno != EROFS) { - fail("Unable to access /proc/sys/kernel/pid_max"); + fail("Unable to access /proc/sys/kernel/ns_last_pid"); return 1; } diff --git a/test/zdtm/static/mountpoints.c b/test/zdtm/static/mountpoints.c index 00475cdc50..cf54d10960 100644 --- a/test/zdtm/static/mountpoints.c +++ b/test/zdtm/static/mountpoints.c @@ -292,8 
+292,8 @@ int main(int argc, char **argv) } if (pid > 0) { - kill(pid, SIGTERM); int status = 1; + kill(pid, SIGTERM); wait(&status); if (status) return 1; diff --git a/test/zdtm/static/netns-nft.c b/test/zdtm/static/netns-nft.c new file mode 100644 index 0000000000..f4991afda8 --- /dev/null +++ b/test/zdtm/static/netns-nft.c @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Check that nft rules (some) are kept"; +const char *test_author = "Alexander Mikhalitsyn "; + +char *filename; +TEST_OPTION(filename, string, "file name", 1); + +int main(int argc, char **argv) +{ + char cmd[128]; + + test_init(argc, argv); + + /* create nft table */ + if (system("nft add table inet netns-nft-zdtm-test")) { + pr_perror("Can't create nft table"); + return -1; + } + + /* create input chain in table */ + if (system("nft add chain inet netns-nft-zdtm-test input { type filter hook input priority 0 \\; }")) { + pr_perror("Can't create input chain in nft table"); + return -1; + } + + /* block ICMPv4 traffic */ + if (system("nft add rule inet netns-nft-zdtm-test input meta nfproto ipv4 icmp type { echo-request } reject")) { + pr_perror("Can't set input rule"); + return -1; + } + + /* save resulting nft table */ + sprintf(cmd, "nft list table inet netns-nft-zdtm-test > pre-%s", filename); + if (system(cmd)) { + pr_perror("Can't get nft table"); + return -1; + } + + test_daemon(); + test_waitsig(); + + /* get nft table */ + sprintf(cmd, "nft list table inet netns-nft-zdtm-test > post-%s", filename); + if (system(cmd)) { + fail("Can't get nft table"); + return -1; + } + + /* compare nft table before/after c/r */ + sprintf(cmd, "diff pre-%s post-%s", filename, filename); + if (system(cmd)) { + fail("nft table differ"); + return -1; + } + + pass(); + return 0; +} diff --git a/test/zdtm/static/netns-nft.checkskip b/test/zdtm/static/netns-nft.checkskip new file mode 100755 index 0000000000..270cafeb55 --- /dev/null +++ 
b/test/zdtm/static/netns-nft.checkskip @@ -0,0 +1,3 @@ +#!/bin/bash + +test -f /usr/sbin/nft || exit 1 diff --git a/test/zdtm/static/netns-nft.desc b/test/zdtm/static/netns-nft.desc new file mode 100644 index 0000000000..f53890a245 --- /dev/null +++ b/test/zdtm/static/netns-nft.desc @@ -0,0 +1,5 @@ +{ 'deps': [ '/bin/sh', + '/usr/sbin/nft', + '/usr/bin/diff'], + 'flags': 'suid', + 'flavor': 'ns uns'} diff --git a/test/zdtm/static/netns_sub_sysctl.c b/test/zdtm/static/netns_sub_sysctl.c new file mode 100644 index 0000000000..bf828e08e2 --- /dev/null +++ b/test/zdtm/static/netns_sub_sysctl.c @@ -0,0 +1,56 @@ +#include + +#include "zdtmtst.h" +#include "sysctl.h" + +const char *test_doc = "Check dump and restore a net.unix.max_dgram_qlen sysctl parameter in subns"; +const char *test_author = "Alexander Mikhalitsyn "; + +typedef struct { + const char *path; + int old; + int new; +} sysctl_opt_t; + +#define CONF_UNIX_BASE "/proc/sys/net/unix" + +static sysctl_opt_t net_unix_params[] = { + {CONF_UNIX_BASE"/max_dgram_qlen", 0, 0}, + {NULL, 0, 0} +}; + +int main(int argc, char **argv) +{ + int ret = 0; + sysctl_opt_t *p; + test_init(argc, argv); + + for (p = net_unix_params; p->path != NULL; p++) { + p->old = (((unsigned)lrand48()) % 1023) + 1; + if (sysctl_write_int(p->path, p->old)) { + pr_perror("Can't change %s", p->path); + return -1; + } + } + + test_daemon(); + test_waitsig(); + + for (p = net_unix_params; p->path != NULL; p++) { + if (sysctl_read_int(p->path, &p->new)) + ret = 1; + + if (p->old != p->new) { + errno = EINVAL; + pr_perror("%s changed: %d ---> %d", p->path, p->old, p->new); + ret = 1; + } + } + + if (ret) + fail(); + else + pass(); + + return ret; +} diff --git a/test/zdtm/static/netns_sub_sysctl.desc b/test/zdtm/static/netns_sub_sysctl.desc new file mode 100644 index 0000000000..5358426683 --- /dev/null +++ b/test/zdtm/static/netns_sub_sysctl.desc @@ -0,0 +1,4 @@ +{ + 'flavor': 'ns', + 'flags': 'suid' +} diff --git a/test/zdtm/static/opath_file.c 
b/test/zdtm/static/opath_file.c new file mode 100644 index 0000000000..943f4eddb6 --- /dev/null +++ b/test/zdtm/static/opath_file.c @@ -0,0 +1,95 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define TEST_FILE "test_file" +#define BUF_SIZE 4096 +#define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field)) +#define pr_debug(format, arg...) test_msg("DBG: %s:%d: " format, __FILE__, __LINE__, ## arg) + +const char *test_doc = "Check open file with O_PATH preserved"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, "directory name", 1); + +struct fdinfo { + int flags; +}; + +static int parse_self_fdinfo(int fd, struct fdinfo *fi) +{ + char path[PATH_MAX], line[BUF_SIZE]; + FILE *file; + int ret = -1; + unsigned long long val; + + snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", fd); + file = fopen(path, "r"); + if (!file) { + pr_perror("fopen"); + return -1; + } + + while (fgets(line, sizeof(line), file)) { + if (fdinfo_field(line, "flags")) { + if (sscanf(line, "%*s %llo", &val) != 1) { + pr_err("failed to read flags: %s", line); + goto fail; + } + pr_debug("Open flags = %llu\n", val); + fi->flags = val; + ret = 0; + break; + } + } +fail: + fclose(file); + return ret; +} + +int main(int argc, char **argv) +{ + char test_file[PATH_MAX]; + struct fdinfo fi; + int fd; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + snprintf(test_file, sizeof(test_file), "%s/%s", dirname, TEST_FILE); + fd = creat(test_file, 0644); + if (fd == -1) { + pr_perror("can't create %s", test_file); + return 1; + } + close(fd); + + fd = open(test_file, O_PATH); + if (fd == -1) { + pr_perror("can't open file %s with O_PATH", test_file); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (parse_self_fdinfo(fd, &fi)) + return 1; + + if (!(fi.flags & O_PATH)) { + fail("File lost O_PATH open flag"); + return 1; + } + + close(fd); +
pass(); + return 0; +} diff --git a/test/zdtm/static/pipe03.c b/test/zdtm/static/pipe03.c index a8721e934f..d649007b70 100644 --- a/test/zdtm/static/pipe03.c +++ b/test/zdtm/static/pipe03.c @@ -13,27 +13,28 @@ const char *test_author = "Andrei Vagin "; int main(int argc, char **argv) { - int p[2], i; + int p[2][2], i; uint8_t buf[BUF_SIZE]; uint32_t crc; test_init(argc, argv); - if (pipe2(p, O_NONBLOCK)) { - pr_perror("pipe"); - return 1; - } - - if (fcntl(p[1], F_SETPIPE_SZ, DATA_SIZE) == -1) { - pr_perror("Unable to change a pipe size"); - return 1; + for (i = 0; i < 2; i++) { + if (pipe2(p[i], O_NONBLOCK)) { + pr_perror("pipe"); + return 1; + } + if (fcntl(p[i][1], F_SETPIPE_SZ, DATA_SIZE) == -1) { + pr_perror("Unable to change a pipe size"); + return 1; + } } crc = ~0; datagen(buf, BUF_SIZE, &crc); for (i = 0; i < DATA_SIZE / BUF_SIZE; i++) { - if (write(p[1], buf, BUF_SIZE) != BUF_SIZE) { + if (write(p[0][1], buf, BUF_SIZE) != BUF_SIZE) { pr_perror("write"); return 1; } @@ -43,12 +44,26 @@ int main(int argc, char **argv) test_waitsig(); for (i = 0; i < DATA_SIZE / BUF_SIZE; i++) { - if (read(p[0], buf, BUF_SIZE) != BUF_SIZE) { + if (read(p[0][0], buf, BUF_SIZE) != BUF_SIZE) { pr_perror("read"); return 1; } } + for (i = 0; i < 2; i++) { + int size; + + size = fcntl(p[i][1], F_GETPIPE_SZ); + if (size < 0) { + pr_perror("Unable to get a pipe size"); + return 1; + } + if (size != DATA_SIZE) { + fail("%d: size %d expected %d", i, size, DATA_SIZE); + return 1; + } + } + pass(); return 0; } diff --git a/test/zdtm/static/remap_dead_pid.c b/test/zdtm/static/remap_dead_pid.c index 261c591b79..5d4241fc6f 100644 --- a/test/zdtm/static/remap_dead_pid.c +++ b/test/zdtm/static/remap_dead_pid.c @@ -40,12 +40,12 @@ int main(int argc, char **argv) while(1) sleep(10); } else { - test_msg("child is %d\n", pid); - int fd, ret; char path[PATH_MAX]; pid_t result; + test_msg("child is %d\n", pid); + sprintf(path, proc_path, pid); fd = open(path, O_RDONLY); if (fd < 0) { diff --git 
a/test/zdtm/static/selinux00.c b/test/zdtm/static/selinux00.c index db8420eacb..b5b3e3cc00 100644 --- a/test/zdtm/static/selinux00.c +++ b/test/zdtm/static/selinux00.c @@ -26,14 +26,14 @@ const char *test_author = "Adrian Reber "; */ char state; -int check_for_selinux() +int check_for_selinux(void) { if (access("/sys/fs/selinux", F_OK) == 0) return 0; return 1; } -int setprofile() +int setprofile(void) { int fd, len; @@ -54,7 +54,7 @@ int setprofile() return 0; } -int checkprofile() +int checkprofile(void) { int fd; char context[1024]; @@ -83,7 +83,7 @@ int checkprofile() return 0; } -int check_sockcreate() +int check_sockcreate(void) { char *output = NULL; FILE *f = fopen("/proc/self/attr/sockcreate", "r"); diff --git a/test/zdtm/static/selinux01.c b/test/zdtm/static/selinux01.c index 9966455c47..cbf145d2a0 100644 --- a/test/zdtm/static/selinux01.c +++ b/test/zdtm/static/selinux01.c @@ -28,14 +28,14 @@ const char *test_author = "Adrian Reber "; */ char state; -int check_for_selinux() +int check_for_selinux(void) { if (access("/sys/fs/selinux", F_OK) == 0) return 0; return 1; } -int setprofile() +int setprofile(void) { int fd, len; @@ -56,7 +56,7 @@ int setprofile() return 0; } -int set_sockcreate() +int set_sockcreate(void) { int fd, len; @@ -77,7 +77,7 @@ int set_sockcreate() return 0; } -int check_sockcreate() +int check_sockcreate(void) { int fd; char context[1024]; @@ -106,7 +106,7 @@ int check_sockcreate() return 0; } -int check_sockcreate_empty() +int check_sockcreate_empty(void) { char *output = NULL; FILE *f = fopen("/proc/self/attr/sockcreate", "r"); @@ -133,6 +133,7 @@ int check_sockcreate_empty() int main(int argc, char **argv) { + int sk; char ctx[1024]; test_init(argc, argv); @@ -159,7 +160,7 @@ int main(int argc, char **argv) #endif /* Open our test socket */ - int sk = socket(AF_INET, SOCK_STREAM, 0); + sk = socket(AF_INET, SOCK_STREAM, 0); memset(ctx, 0, 1024); /* Read out the socket label */ if (fgetxattr(sk, "security.selinux", ctx, 1024) == -1) 
{ diff --git a/test/zdtm/static/session02.c b/test/zdtm/static/session02.c index 37f245d2e8..f5c81df161 100644 --- a/test/zdtm/static/session02.c +++ b/test/zdtm/static/session02.c @@ -25,7 +25,7 @@ struct process *processes; int nr_processes = 20; int current = 0; -static void cleanup() +static void cleanup(void) { int i; @@ -55,9 +55,9 @@ struct command int arg2; }; -static void handle_command(); +static void handle_command(void); -static void mainloop() +static void mainloop(void) { while (1) handle_command(); @@ -100,7 +100,7 @@ static int make_child(int id, int flags) return cid; } -static void handle_command() +static void handle_command(void) { int sk = processes[current].sks[0], ret, status = 0; struct command cmd; diff --git a/test/zdtm/static/session03.c b/test/zdtm/static/session03.c index 2b3c46c326..8ca16e4102 100644 --- a/test/zdtm/static/session03.c +++ b/test/zdtm/static/session03.c @@ -36,7 +36,7 @@ static void sigchld_handler(int signal, siginfo_t *siginfo, void *data) waitpid(pid, NULL, WNOHANG); } -static void cleanup() +static void cleanup(void) { int i, ret; @@ -72,7 +72,7 @@ enum commands int cmd_weght[TEST_MAX] = {10, 3, 1, 10, 7}; int sum_weight = 0; -static int get_rnd_op() +static int get_rnd_op(void) { int i, m; if (sum_weight == 0) { @@ -97,9 +97,9 @@ struct command int arg2; }; -static void handle_command(); +static void handle_command(void); -static void mainloop() +static void mainloop(void) { while (1) handle_command(); @@ -142,7 +142,7 @@ static int make_child(int id, int flags) return cid; } -static void handle_command() +static void handle_command(void) { int sk = processes[current].sks[0], ret, status = 0; struct command cmd; diff --git a/test/zdtm/static/shmemfd-priv.c b/test/zdtm/static/shmemfd-priv.c new file mode 100644 index 0000000000..bbdb46905b --- /dev/null +++ b/test/zdtm/static/shmemfd-priv.c @@ -0,0 +1,84 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char 
*test_doc = "Test C/R of shared memory file descriptors"; +const char *test_author = "Andrei Vagin "; + +#define err(exitcode, msg, ...) ({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +int main(int argc, char *argv[]) +{ + void *addr, *priv_addr, *addr2; + char path[4096]; + int fd; + + test_init(argc, argv); + + addr = mmap(NULL, 5 * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + + *(int *) addr = 1; + *(int *) (addr + PAGE_SIZE) = 11; + *(int *) (addr + 2 * PAGE_SIZE) = 111; + + snprintf(path, sizeof(path), "/proc/self/map_files/%lx-%lx", + (long)addr, (long)addr + 5 * PAGE_SIZE); + fd = open(path, O_RDWR | O_LARGEFILE); + if (fd < 0) + err(1, "Can't open %s", path); + + priv_addr = mmap(NULL, 5 * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd, PAGE_SIZE); + if (priv_addr == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + + addr2 = mmap(NULL, 5 * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd, 2 * PAGE_SIZE); + if (addr2 == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + + *(int *) (priv_addr + PAGE_SIZE) = 22; + + test_daemon(); + test_waitsig(); + + if (*(int *) (priv_addr + PAGE_SIZE) != 22) { + fail("the second page of the private mapping is corrupted"); + return 1; + } + if (*(int *) (priv_addr) != 11) { + fail("the first page of the private mapping is corrupted"); + return 1; + } + if (*(int *) (addr2) != 111) { + fail("the first page of the second shared mapping is corrupted"); + return 1; + } + *(int *) (addr2) = 333; + if (*(int *) (addr + 2 * PAGE_SIZE) != 333) { + fail("the first page of the second shared mapping isn't shared"); + return 1; + } + *(int *) (addr + 3 * PAGE_SIZE) = 444; + if (*(int *) (priv_addr + 2 * PAGE_SIZE) != 444) { + fail("the third page of the private mapping is corrupted"); + return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/shmemfd-priv.desc 
b/test/zdtm/static/shmemfd-priv.desc new file mode 100644 index 0000000000..d969725f6d --- /dev/null +++ b/test/zdtm/static/shmemfd-priv.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/test/zdtm/static/shmemfd.c b/test/zdtm/static/shmemfd.c new file mode 100644 index 0000000000..b65faa2e11 --- /dev/null +++ b/test/zdtm/static/shmemfd.c @@ -0,0 +1,107 @@ +#include +#include + +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "Test C/R of shared memory file descriptors"; +const char *test_author = "Andrei Vagin "; + +#define err(exitcode, msg, ...) ({ pr_perror(msg, ##__VA_ARGS__); exit(exitcode); }) + +int main(int argc, char *argv[]) +{ + int fd, fl_flags1, fl_flags2, fd_flags1, fd_flags2; + struct statfs statfs1, statfs2; + off_t pos1, pos2; + char path[4096]; + char buf[5]; + void *addr; + + test_init(argc, argv); + + addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); + if (addr == MAP_FAILED) { + pr_perror("mmap"); + return 1; + } + + snprintf(path, sizeof(path), "/proc/self/map_files/%lx-%lx", + (long)addr, (long)addr + PAGE_SIZE); + fd = open(path, O_RDWR | O_LARGEFILE); + if (fd < 0) + err(1, "Can't open %s", path); + ftruncate(fd, 0); + munmap(addr, PAGE_SIZE); + + if (fcntl(fd, F_SETFL, O_APPEND) < 0) + err(1, "Can't set fl flags"); + + if ((fl_flags1 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if ((fd_flags1 = fcntl(fd, F_GETFD)) == -1) + err(1, "Can't get fd flags"); + + if (fstatfs(fd, &statfs1) < 0) + err(1, "statfs issue"); + + if (write(fd, "hello", 5) != 5) + err(1, "write error"); + + pos1 = 3; + if (lseek(fd, pos1, SEEK_SET) < 0) + err(1, "seek error"); + + test_daemon(); + test_waitsig(); + + if ((fl_flags2 = fcntl(fd, F_GETFL)) == -1) + err(1, "Can't get fl flags"); + + if (fl_flags1 != fl_flags2) { + fail("fl flags differs %x %x", fl_flags1, fl_flags2); + return 1; + } + + if ((fd_flags2 = fcntl(fd, F_GETFD)) == -1) +
err(1, "Can't get fd flags"); + + if (fd_flags1 != fd_flags2) { + fail("fd flags differs"); + return 1; + } + + if (fstatfs(fd, &statfs2) < 0) + err(1, "statfs issue"); + + if (statfs1.f_type != statfs2.f_type) { + fail("statfs.f_type differs"); + return 1; + } + + pos2 = lseek(fd, 0, SEEK_CUR); + if (pos1 != pos2) { + fail("position differs"); + return 1; + } + + if (pread(fd, buf, sizeof(buf), 0) != sizeof(buf)) { + fail("read problem"); + return 1; + } + + if (memcmp(buf, "hello", sizeof(buf))) { + fail("content mismatch"); + return 1; + } + + pass(); + + return 0; +} diff --git a/test/zdtm/static/shmemfd.desc b/test/zdtm/static/shmemfd.desc new file mode 100644 index 0000000000..d969725f6d --- /dev/null +++ b/test/zdtm/static/shmemfd.desc @@ -0,0 +1 @@ +{'flavor': 'h ns', 'flags': 'suid'} diff --git a/test/zdtm/static/sigaltstack.c b/test/zdtm/static/sigaltstack.c index d324b0d378..f36d409f5e 100644 --- a/test/zdtm/static/sigaltstack.c +++ b/test/zdtm/static/sigaltstack.c @@ -61,17 +61,17 @@ void thread_sigaction(int signo, siginfo_t *info, void *context) static void *thread_func(void *arg) { + struct sigaction sa = { + .sa_sigaction = thread_sigaction, + .sa_flags = SA_RESTART | SA_ONSTACK, + }; + sas_state[SAS_THRD_OLD] = (stack_t) { .ss_size = sizeof(stack_thread) - 8, .ss_sp = stack_thread, .ss_flags = 0, }; - struct sigaction sa = { - .sa_sigaction = thread_sigaction, - .sa_flags = SA_RESTART | SA_ONSTACK, - }; - sigemptyset(&sa.sa_mask); if (sigaction(SIGUSR2, &sa, NULL)) { @@ -103,17 +103,17 @@ int main(int argc, char *argv[]) { pthread_t thread; + struct sigaction sa = { + .sa_sigaction = leader_sigaction, + .sa_flags = SA_RESTART | SA_ONSTACK, + }; + sas_state[SAS_MAIN_OLD] = (stack_t) { .ss_size = sizeof(stack_main) - 8, .ss_sp = stack_main, .ss_flags = 0, }; - struct sigaction sa = { - .sa_sigaction = leader_sigaction, - .sa_flags = SA_RESTART | SA_ONSTACK, - }; - sigemptyset(&sa.sa_mask); test_init(argc, argv); diff --git 
a/test/zdtm/static/sk-unix01.c b/test/zdtm/static/sk-unix01.c index 2bceef79a7..0e9006a152 100644 --- a/test/zdtm/static/sk-unix01.c +++ b/test/zdtm/static/sk-unix01.c @@ -24,22 +24,6 @@ const char *test_author = "Cyrill Gorcunov "; char *dirname; TEST_OPTION(dirname, string, "directory name", 1); -static int fill_sock_name(struct sockaddr_un *name, const char *filename) -{ - char *cwd; - - cwd = get_current_dir_name(); - if (strlen(filename) + strlen(cwd) + 1 >= sizeof(name->sun_path)) { - pr_err("Name %s/%s is too long for socket\n", - cwd, filename); - return -1; - } - - name->sun_family = AF_LOCAL; - ssprintf(name->sun_path, "%s/%s", cwd, filename); - return 0; -} - static int sk_alloc_bind(int type, struct sockaddr_un *addr) { int sk; @@ -155,10 +139,9 @@ int main(int argc, char **argv) */ ssprintf(filename, "%s/%s", subdir_dg, "sk-dt"); - if (fill_sock_name(&addr, filename) < 0) { - pr_err("%s is too long for socket\n", filename); + if (unix_fill_sock_name(&addr, filename)) return 1; - } + unlink(addr.sun_path); sk_dgram[0] = sk_alloc_bind(SOCK_DGRAM, &addr); @@ -184,10 +167,9 @@ int main(int argc, char **argv) test_msg("sk-dt: alloc/connect/unlink %d %s\n", sk_dgram[3], addr.sun_path); ssprintf(filename, "%s/%s", dirname, "sole"); - if (fill_sock_name(&addr, filename) < 0) { - pr_err("%s is too long for socket\n", filename); + if (unix_fill_sock_name(&addr, filename)) return 1; - } + unlink(addr.sun_path); sk_dgram[4] = sk_alloc_bind(SOCK_DGRAM, &addr); @@ -237,7 +219,7 @@ int main(int argc, char **argv) sk_dgram_pair[0], sk_dgram_pair[1]); ssprintf(filename, "%s/%s", subdir_dg, "sk-dtp"); - if (fill_sock_name(&addr, filename) < 0) { + if (unix_fill_sock_name(&addr, filename)) { pr_err("%s is too long for socket\n", filename); return 1; } @@ -270,10 +252,9 @@ int main(int argc, char **argv) * - delete socket on fs */ ssprintf(filename, "%s/%s", subdir_st, "sk-st"); - if (fill_sock_name(&addr, filename) < 0) { - pr_err("%s is too long for socket\n", 
filename); + if (unix_fill_sock_name(&addr, filename)) return 1; - } + unlink(addr.sun_path); sk_st[0] = sk_alloc_bind(SOCK_STREAM, &addr); diff --git a/test/zdtm/static/socket-tcp-fin-wait1.c b/test/zdtm/static/socket-tcp-fin-wait1.c index 6c7cc93e56..50da9c1528 100644 --- a/test/zdtm/static/socket-tcp-fin-wait1.c +++ b/test/zdtm/static/socket-tcp-fin-wait1.c @@ -141,7 +141,7 @@ int main(int argc, char **argv) return 1; } - if (write(fd, TEST_MSG + 2, sizeof(TEST_MSG) - 2) != sizeof(TEST_MSG) - 2) { + if (write(fd, &TEST_MSG[2], sizeof(TEST_MSG) - 2) != sizeof(TEST_MSG) - 2) { pr_err("write"); return 1; } diff --git a/test/zdtm/static/socket-tcp-keepalive.c b/test/zdtm/static/socket-tcp-keepalive.c new file mode 100644 index 0000000000..a977a03b53 --- /dev/null +++ b/test/zdtm/static/socket-tcp-keepalive.c @@ -0,0 +1,97 @@ +#include +#include +#include +#include + +#include "zdtmtst.h" + +const char *test_doc = "test checkpoint/restore of SO_KEEPALIVE\n"; +const char *test_author = "Radostin Stoyanov \n"; + +int main(int argc, char **argv) +{ + int sk; + int alive = 1; + int cnt = 5; + int idle = 10; + int intvl = 15; + int optval; + socklen_t optlen; + + test_init(argc, argv); + + sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sk < 0) { + pr_perror("Can't create socket"); + return 1; + } + + /* Set the option active */ + if (setsockopt(sk, SOL_SOCKET, SO_KEEPALIVE, &alive, sizeof(alive)) < 0) { + pr_perror("setsockopt SO_KEEPALIVE"); + return 1; + } + + if (setsockopt(sk, SOL_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) < 0) { + pr_perror("setsockopt TCP_KEEPCNT"); + return 1; + } + + if (setsockopt(sk, SOL_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) < 0) { + pr_perror("setsockopt TCP_KEEPIDLE"); + return 1; + } + + optval = 5; + optlen = sizeof(optval); + if (setsockopt(sk, SOL_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) < 0) { + pr_perror("setsockopt TCP_KEEPINTVL"); + return 1; + } + + test_daemon(); + test_waitsig(); + + if (getsockopt(sk, SOL_SOCKET, SO_KEEPALIVE, 
&optval, &optlen)) { + pr_perror("getsockopt SO_KEEPALIVE"); + return 1; + } + + if (optlen != sizeof(optval) || optval != alive) { + fail("SO_KEEPALIVE not set"); + return 1; + } + + if (getsockopt(sk, SOL_TCP, TCP_KEEPCNT, &optval, &optlen) < 0) { + pr_perror("getsockopt TCP_KEEPCNT"); + return 1; + } + + if (optval != cnt) { + fail("TCP_KEEPCNT has incorrect value (%d != %d)", cnt, optval); + return 1; + } + + if (getsockopt(sk, SOL_TCP, TCP_KEEPIDLE, &optval, &optlen) < 0) { + pr_perror("getsockopt TCP_KEEPIDLE"); + return 1; + } + + if (optval != idle) { + fail("TCP_KEEPIDLE has incorrect value (%d != %d)", idle, optval); + return 1; + } + + if (getsockopt(sk, SOL_TCP, TCP_KEEPINTVL, &optval, &optlen) < 0) { + pr_perror("getsockopt TCP_KEEPINTVL"); + return 1; + } + + if (optval != intvl) { + fail("TCP_KEEPINTVL has incorrect value (%d != %d)", intvl, optval); + return 1; + } + + pass(); + return 0; +} \ No newline at end of file diff --git a/test/zdtm/static/socket-tcp-syn-sent.c b/test/zdtm/static/socket-tcp-syn-sent.c index cf4c3bb46c..755532a8a1 100644 --- a/test/zdtm/static/socket-tcp-syn-sent.c +++ b/test/zdtm/static/socket-tcp-syn-sent.c @@ -37,7 +37,7 @@ int main(int argc, char **argv) { int fd, fd_s, sock, sk; union sockaddr_inet addr; - char cmd[4096]; + char c, cmd[4096]; test_init(argc, argv); @@ -113,7 +113,7 @@ int main(int argc, char **argv) fcntl(sock, F_SETFL, 0); - char c = 5; + c = 5; if (write(sock, &c, 1) != 1) { fail("Unable to send data"); return 1; diff --git a/test/zdtm/static/symlink.c b/test/zdtm/static/symlink.c new file mode 100644 index 0000000000..074c800522 --- /dev/null +++ b/test/zdtm/static/symlink.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include + +#include "zdtmtst.h" + +#define TEST_FILE "test_file" +#define TEST_SYMLINK "test_symlink" + +const char *test_doc = "Check open symlink preserved"; +const char *test_author = "Pavel Tikhomirov "; + +char *dirname; +TEST_OPTION(dirname, string, 
"directory name", 1); + +int main(int argc, char **argv) +{ + char test_symlink[PATH_MAX]; + char test_file[PATH_MAX]; + char pathbuf[PATH_MAX]; + struct stat stb, sta; + int ret, fd; + + test_init(argc, argv); + + if (mkdir(dirname, 0700)) { + pr_perror("can't make directory %s", dirname); + exit(1); + } + + snprintf(test_file, sizeof(test_file), "%s/%s", dirname, TEST_FILE); + ret = creat(test_file, 0644); + if (ret == -1) { + pr_perror("can't create %s", test_file); + return 1; + } + close(ret); + + snprintf(test_symlink, sizeof(test_symlink), "%s/%s", dirname, TEST_SYMLINK); + ret = symlink(test_file, test_symlink); + if (ret == -1) { + pr_perror("can't symlink to %s", test_symlink); + return 1; + } + + fd = open(test_symlink, O_PATH | O_NOFOLLOW); + if (fd == -1) { + pr_perror("can't open symlink %s", test_symlink); + return 1; + } + + ret = fstat(fd, &sta); + if (ret == -1) { + pr_perror("can't fstat %s", test_symlink); + return 1; + } + + if (!S_ISLNK(sta.st_mode)) { + pr_err("file is not symlink %s\n", test_symlink); + return 1; + } + +#ifdef ZDTM_UNLINK_SYMLINK + if (unlink(test_symlink)) { + pr_perror("can't unlink symlink %s", test_symlink); + return 1; + } +#endif + + test_daemon(); + test_waitsig(); + + ret = fstat(fd, &stb); + if (ret == -1) { + fail("can't fstat %s", test_symlink); + return 1; + } + + if (!S_ISLNK(stb.st_mode)) { + fail("file is not symlink %s", test_symlink); + return 1; + } + + ret = readlinkat(fd, "", pathbuf, sizeof(pathbuf) - 1); + if (ret < 0) { + fail("Can't readlinkat"); + return 1; + } + pathbuf[ret] = 0; + + if (strcmp(test_file, pathbuf)) { + fail("symlink points to %s but %s expected", pathbuf, test_file); + return 1; + } + + close(fd); + pass(); + return 0; +} diff --git a/test/zdtm/static/symlink01.c b/test/zdtm/static/symlink01.c new file mode 120000 index 0000000000..e2d071ea4c --- /dev/null +++ b/test/zdtm/static/symlink01.c @@ -0,0 +1 @@ +symlink.c \ No newline at end of file diff --git 
a/test/zdtm/static/unlink_multiple_largefiles.c b/test/zdtm/static/unlink_multiple_largefiles.c index 7cf628606b..2f9248c2f7 100644 --- a/test/zdtm/static/unlink_multiple_largefiles.c +++ b/test/zdtm/static/unlink_multiple_largefiles.c @@ -30,10 +30,11 @@ void create_check_pattern(char *buf, size_t count, unsigned char seed) struct fiemap *read_fiemap(int fd) { - test_msg("Obtaining fiemap for fd %d\n", fd); struct fiemap *fiemap, *tmp; int extents_size; + test_msg("Obtaining fiemap for fd %d\n", fd); + fiemap = malloc(sizeof(struct fiemap)); if (fiemap == NULL) { pr_perror("Cannot allocate fiemap"); diff --git a/test/zdtm/transition/fifo_loop.c b/test/zdtm/transition/fifo_loop.c index 2e28320ba6..b065925867 100644 --- a/test/zdtm/transition/fifo_loop.c +++ b/test/zdtm/transition/fifo_loop.c @@ -39,6 +39,7 @@ int main(int argc, char **argv) int i; uint8_t buf[0x100000]; char *file_path; + int pipe_size; test_init(argc, argv); @@ -83,6 +84,14 @@ int main(int argc, char **argv) ret = errno; return ret; } + + pipe_size = fcntl(writefd, F_SETPIPE_SZ, sizeof(buf)); + if (pipe_size != sizeof(buf)) { + pr_perror("fcntl(writefd, F_SETPIPE_SZ) -> %d", pipe_size); + kill(0, SIGKILL); + exit(1); + } + signal(SIGPIPE, SIG_IGN); if (pipe_in2out(readfd, writefd, buf, sizeof(buf)) < 0) /* pass errno as exit code to the parent */ @@ -104,6 +113,13 @@ int main(int argc, char **argv) exit(1); } + pipe_size = fcntl(writefd, F_SETPIPE_SZ, sizeof(buf)); + if (pipe_size != sizeof(buf)) { + pr_perror("fcntl(writefd, F_SETPIPE_SZ) -> %d", pipe_size); + kill(0, SIGKILL); + exit(1); + } + file_path = path[i - 1]; readfd = open(file_path, O_RDONLY); if (readfd < 0) { @@ -138,13 +154,16 @@ int main(int argc, char **argv) for (p = rbuf, len = wlen; len > 0; p += rlen, len -= rlen) { rlen = read(readfd, p, len); + if (rlen < 0 && errno == EINTR) { + /* interrupted before any data arrived: zero rlen so the loop step keeps p and len unchanged */ + rlen = 0; + continue; + } + if (rlen <= 0) break; } - if (rlen < 0 && errno == EINTR) - continue; - if (len > 0) { fail("read failed: %m\n"); ret = 1; diff 
--git a/test/zdtm/transition/file_aio.c b/test/zdtm/transition/file_aio.c index a160101589..4a76c93907 100644 --- a/test/zdtm/transition/file_aio.c +++ b/test/zdtm/transition/file_aio.c @@ -17,7 +17,6 @@ const char *test_author = "Andrew Vagin "; int main(int argc, char **argv) { - test_init(argc, argv); char buf[BUF_SIZE]; int fd; struct aiocb aiocb; @@ -25,6 +24,8 @@ int main(int argc, char **argv) char tmpfname[256]="/tmp/file_aio.XXXXXX"; int ret; + test_init(argc, argv); + fd = mkstemp(tmpfname); if (fd == -1) { pr_perror("mkstemp() failed"); diff --git a/test/zdtm/transition/file_read.c b/test/zdtm/transition/file_read.c index 50dffd8c47..5d6e4dbbac 100644 --- a/test/zdtm/transition/file_read.c +++ b/test/zdtm/transition/file_read.c @@ -158,9 +158,11 @@ static void chew_some_file(int num) rv = SEEK_FAILED; goto out_exit; case 1: - rv = FILE_CORRUPTED; + { int fd1; char str[PATH_MAX]; + + rv = FILE_CORRUPTED; // create standard file sprintf(str, "standard_%s.%d", filename, num); fd1 = open(str, O_WRONLY | O_CREAT | O_TRUNC, 0666); @@ -168,6 +170,7 @@ static void chew_some_file(int num) pr_perror("can't write %s", str); close(fd1); goto out_exit; + } } } rv = SUCCESS; diff --git a/test/zdtm/transition/maps008.c b/test/zdtm/transition/maps008.c index 5f6eb0887d..7ed7c10a5c 100644 --- a/test/zdtm/transition/maps008.c +++ b/test/zdtm/transition/maps008.c @@ -348,6 +348,7 @@ static int proc11_func(task_waiter_t *setup_waiter) void *mem3_old = mem3; size_t mem3_size_old = mem3_size; uint32_t crc_epoch = 0; + uint8_t *proc1_mem3; pstree->proc11 = getpid(); xmunmap(mem3, MEM3_START_CUT); @@ -382,7 +383,7 @@ static int proc11_func(task_waiter_t *setup_waiter) chk_proc_mem_eq(pstree->proc11, mem3, mem3_size, pstree->proc112, mem3, mem3_size + MEM3_END_CUT); - uint8_t *proc1_mem3 = mmap_proc_mem(pstree->proc1, + proc1_mem3 = mmap_proc_mem(pstree->proc1, (unsigned long)mem3_old, mem3_size_old); check_mem_eq(mem3, mem3_size, proc1_mem3 + MEM3_START_CUT, mem3_size); 
xmunmap(proc1_mem3, mem3_size_old); @@ -489,16 +490,17 @@ static void sigchld_hand(int signo, siginfo_t *info, void *ucontext) int main(int argc, char **argv) { - test_init(argc, argv); - - pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); - test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); - struct sigaction sa = { .sa_sigaction = sigchld_hand, .sa_flags = SA_RESTART | SA_SIGINFO | SA_NOCLDSTOP }; sigemptyset(&sa.sa_mask); + + test_init(argc, argv); + + pstree = (struct pstree *)mmap_ashmem(PAGE_SIZE); + test_sync = (struct test_sync *)mmap_ashmem(sizeof(*test_sync)); + if (sigaction(SIGCHLD, &sa, NULL)) { pr_perror("SIGCHLD handler setup"); exit(1); diff --git a/test/zdtm/transition/netlink00.c b/test/zdtm/transition/netlink00.c index c9b2303e81..3504a48a12 100644 --- a/test/zdtm/transition/netlink00.c +++ b/test/zdtm/transition/netlink00.c @@ -56,12 +56,12 @@ struct rtmsg *rtp; int rtl; struct rtattr *rtap; -int send_request(); -int recv_reply(); -int form_request_add(); -int form_request_del(); -int read_reply(); -typedef int (*cmd_t)(); +int send_request(void); +int recv_reply(void); +int form_request_add(void); +int form_request_del(void); +int read_reply(void); +typedef int (*cmd_t)(void); #define CMD_NUM 2 cmd_t cmd[CMD_NUM]={form_request_add, form_request_del}; @@ -120,7 +120,7 @@ int main(int argc, char *argv[]) return 0; } -int send_request() +int send_request(void) { // create the remote address // to communicate @@ -145,7 +145,7 @@ int send_request() } return 0; } -int recv_reply() +int recv_reply(void) { char *p; // initialize the socket read buffer @@ -191,7 +191,7 @@ int recv_reply() return 0; } -int read_reply() +int read_reply(void) { //string to hold content of the route // table (i.e. 
one entry) @@ -250,7 +250,7 @@ int read_reply() #define NLMSG_TAIL(nmsg) \ ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) -int form_request_del() +int form_request_del(void) { bzero(&req, sizeof(req)); req.nl.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); @@ -272,7 +272,7 @@ int form_request_del() return 0; } -int form_request_add() +int form_request_add(void) { int ifcn = 1; //interface number