Skip to content

Commit

Permalink
Initial support for bhyve save and restore.
Browse files Browse the repository at this point in the history
Save and restore (also known as suspend and resume) permits a snapshot
to be taken of a guest's state that can later be resumed.  In the
current implementation, bhyve(8) creates a UNIX domain socket that is
used by bhyvectl(8) to send a request to save a snapshot (and
optionally exit after the snapshot has been taken).  A snapshot
currently consists of two files: the first holds a copy of guest RAM,
and the second file holds other guest state such as vCPU register
values and device model state.

To resume a guest, bhyve(8) must be started with a matching pair of
command line arguments to instantiate the same set of device models as
well as a pointer to the saved snapshot.

While the current implementation is useful for several use cases, it
has a few limitations.  The file format for saving the guest state is
tied to the ABI of internal bhyve structures and is not
self-describing (in that it does not communicate the set of device
models present in the system).  In addition, the state saved for some
device models closely matches the internal data structures which might
prove a challenge for compatibility of snapshot files across a range
of bhyve versions.  The file format also does not currently support
versioning of individual chunks of state.  As a result, the current
file format is not a fixed binary format and future revisions to save
and restore will break binary compatibility of snapshot files.  The
goal is to move to a more flexible format that adds versioning,
etc. and at that point to commit to providing a reasonable level of
compatibility.  As a result, the current implementation is not enabled
by default.  It can be enabled via the WITH_BHYVE_SNAPSHOT=yes option
for userland builds, and the kernel option BHYVE_SNAPSHOT.

Submitted by:	Mihai Tiganus, Flavius Anton, Darius Mihai
Submitted by:	Elena Mihailescu, Mihai Carabas, Sergiu Weisz
Relnotes:	yes
Sponsored by:	University Politehnica of Bucharest
Sponsored by:	Matthew Grooms (student scholarships)
Sponsored by:	iXsystems
Differential Revision:	https://reviews.freebsd.org/D19495
  • Loading branch information
bsdjhb committed May 5, 2020
1 parent 51a5392 commit 483d953
Show file tree
Hide file tree
Showing 71 changed files with 5,616 additions and 49 deletions.
65 changes: 65 additions & 0 deletions lib/libvmmapi/vmmapi.c
Expand Up @@ -44,6 +44,7 @@ __FBSDID("$FreeBSD$");
#include <machine/specialreg.h>

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
Expand All @@ -53,8 +54,10 @@ __FBSDID("$FreeBSD$");

#include <libutil.h>

#include <vm/vm.h>
#include <machine/vmm.h>
#include <machine/vmm_dev.h>
#include <machine/vmm_snapshot.h>

#include "vmmapi.h"

Expand Down Expand Up @@ -237,6 +240,17 @@ vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
return (error);
}

/*
 * Report the guest memory layout recorded in 'ctx'.
 *
 * Stores ctx->baseaddr in '*guest_baseaddr', ctx->lowmem in '*lowmem_size'
 * and ctx->highmem in '*highmem_size'.  All three output pointers are
 * written unconditionally, so none of them may be NULL.
 *
 * Always returns 0.
 */
int
vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
    size_t *lowmem_size, size_t *highmem_size)
{

	/* Region sizes first, then the base the regions are relative to. */
	*highmem_size = ctx->highmem;
	*lowmem_size = ctx->lowmem;
	*guest_baseaddr = ctx->baseaddr;

	return (0);
}

int
vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
Expand Down Expand Up @@ -448,6 +462,34 @@ vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
return (NULL);
}

/*
 * Inverse of vm_map_gpa(): translate a host pointer into a guest physical
 * address.
 *
 * The candidate address is the byte offset of 'addr' from ctx->baseaddr.
 * It is returned when it lies in the low-memory range [0, ctx->lowmem] or
 * in the high-memory range [4GB, 4GB + ctx->highmem); a range whose size
 * is zero is never matched.  Otherwise (vm_paddr_t)-1 is returned.
 *
 * NOTE(review): the low-memory upper bound is inclusive while the
 * high-memory one is exclusive.  The inclusive bound is preserved here
 * because callers may rely on it; confirm whether it is intentional.
 */
vm_paddr_t
vm_rev_map_gpa(struct vmctx *ctx, void *addr)
{
	vm_paddr_t gpa;

	gpa = (char *)addr - ctx->baseaddr;

	/*
	 * No explicit lower-bound test for low memory: vm_paddr_t is an
	 * unsigned type, so the offset can never compare below zero.
	 */
	if (ctx->lowmem > 0 && gpa <= ctx->lowmem)
		return (gpa);

	if (ctx->highmem > 0 && gpa >= 4*GB && gpa < 4*GB + ctx->highmem)
		return (gpa);

	/* Not inside either guest memory range. */
	return ((vm_paddr_t)-1);
}

/*
 * Copy the VM name stored in 'ctx' into 'buf', a buffer of 'max_len' bytes.
 *
 * Returns 0 when the whole name fits, or EINVAL when strlcpy() reports a
 * source length of 'max_len' or more, i.e. the name was truncated.
 *
 * TODO: maximum size for vmname
 */
int
vm_get_name(struct vmctx *ctx, char *buf, size_t max_len)
{
	size_t needed;

	/* strlcpy() returns the length of the source string. */
	needed = strlcpy(buf, ctx->name, max_len);

	return (needed >= max_len ? EINVAL : 0);
}

size_t
vm_get_lowmem_size(struct vmctx *ctx)
{
Expand Down Expand Up @@ -1501,6 +1543,29 @@ vm_restart_instruction(void *arg, int vcpu)
return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
}

/*
 * Issue the VM_SNAPSHOT_REQ ioctl on the descriptor meta->ctx->fd, passing
 * 'meta' itself as the ioctl argument.
 *
 * Returns 0 on success and -1 when the ioctl fails.  When compiled with
 * SNAPSHOT_DEBUG, a failure is additionally reported on stderr together
 * with meta->dev_name and the current errno value.
 */
int
vm_snapshot_req(struct vm_snapshot_meta *meta)
{
	int rc;

	rc = ioctl(meta->ctx->fd, VM_SNAPSHOT_REQ, meta);
	if (rc != -1)
		return (0);

#ifdef SNAPSHOT_DEBUG
	fprintf(stderr, "%s: snapshot failed for %s: %d\r\n",
	    __func__, meta->dev_name, errno);
#endif
	return (-1);
}

/*
 * Issue the VM_RESTORE_TIME ioctl on the descriptor ctx->fd.
 *
 * The ioctl is declared with an int argument, so a zero-initialized
 * placeholder is handed to it.  The ioctl() return value is passed back
 * to the caller unchanged.
 */
int
vm_restore_time(struct vmctx *ctx)
{
	int placeholder = 0;

	return (ioctl(ctx->fd, VM_RESTORE_TIME, &placeholder));
}

int
vm_set_topology(struct vmctx *ctx,
uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus)
Expand Down
29 changes: 29 additions & 0 deletions lib/libvmmapi/vmmapi.h
Expand Up @@ -33,6 +33,7 @@

#include <sys/param.h>
#include <sys/cpuset.h>
#include <machine/vmm_dev.h>

/*
* API version for out-of-tree consumers like grub-bhyve for making compile
Expand All @@ -42,6 +43,7 @@

struct iovec;
struct vmctx;
struct vm_snapshot_meta;
enum x2apic_state;

/*
Expand Down Expand Up @@ -88,6 +90,10 @@ int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
*/
int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);

/*
 * Retrieve the guest memory layout held in 'ctx': the guest base address
 * and the sizes of the low and high memory regions are stored through the
 * three output pointers.  Returns 0.
 */
int vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
size_t *lowmem_size, size_t *highmem_size);

/*
* Create a device memory segment identified by 'segid'.
*
Expand All @@ -110,6 +116,8 @@ void vm_destroy(struct vmctx *ctx);
int vm_parse_memsize(const char *optarg, size_t *memsize);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
/* inverse operation to vm_map_gpa - extract guest address from host pointer */
vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
uint64_t gla, int prot, uint64_t *gpa, int *fault);
Expand All @@ -120,6 +128,7 @@ uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
int vm_get_memflags(struct vmctx *ctx);
int vm_get_name(struct vmctx *ctx, char *buffer, size_t max_len);
size_t vm_get_lowmem_size(struct vmctx *ctx);
size_t vm_get_highmem_size(struct vmctx *ctx);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
Expand Down Expand Up @@ -237,4 +246,24 @@ int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
uint32_t eip, uint32_t gdtbase,
uint32_t esp);
void vm_setup_freebsd_gdt(uint64_t *gdtr);

/*
* Save and restore
*/

#define MAX_SNAPSHOT_VMNAME 100

/*
 * Operation codes for a checkpoint request.
 *
 * NOTE(review): presumably these are the values stored in
 * struct checkpoint_op.op, with START_CHECKPOINT taking a snapshot and
 * START_SUSPEND taking a snapshot and then stopping the guest -- confirm
 * against the request handler in bhyve(8).
 */
enum checkpoint_opcodes {
START_CHECKPOINT = 0,
START_SUSPEND = 1,
};

/*
 * A checkpoint request: an operation code plus the name of the snapshot
 * file, held in a fixed buffer of MAX_SNAPSHOT_VMNAME bytes.
 *
 * NOTE(review): presumably this is the message bhyvectl(8) sends to
 * bhyve(8) over the UNIX domain socket -- confirm against both users
 * before changing the layout.
 */
struct checkpoint_op {
/* NOTE(review): presumably an enum checkpoint_opcodes value -- confirm. */
unsigned int op;
/* Snapshot file name; capacity is MAX_SNAPSHOT_VMNAME bytes. */
char snapshot_filename[MAX_SNAPSHOT_VMNAME];
};

/*
 * Issue the VM_SNAPSHOT_REQ ioctl for 'meta' on the VM referenced by
 * meta->ctx.  Returns 0 on success and -1 on failure.
 */
int vm_snapshot_req(struct vm_snapshot_meta *meta);
/*
 * Issue the VM_RESTORE_TIME ioctl on the VM in 'ctx' and return the
 * ioctl() result.
 */
int vm_restore_time(struct vmctx *ctx);

#endif /* _VMMAPI_H_ */
9 changes: 8 additions & 1 deletion share/man/man5/src.conf.5
@@ -1,6 +1,6 @@
.\" DO NOT EDIT-- this file is @generated by tools/build/options/makeman.
.\" $FreeBSD$
.Dd April 30, 2020
.Dd May 4, 2020
.Dt SRC.CONF 5
.Os
.Sh NAME
Expand Down Expand Up @@ -170,6 +170,13 @@ Set to not build or install
associated utilities, and examples.
.Pp
This option only affects amd64/amd64.
.It Va WITH_BHYVE_SNAPSHOT
Set to include support for save and restore (snapshots) in
.Xr bhyve 8
and
.Xr bhyvectl 8 .
.Pp
This option only affects amd64/amd64.
.It Va WITH_BIND_NOW
Build all binaries with the
.Dv DF_BIND_NOW
Expand Down
1 change: 1 addition & 0 deletions share/mk/src.opts.mk
Expand Up @@ -200,6 +200,7 @@ __DEFAULT_YES_OPTIONS = \

__DEFAULT_NO_OPTIONS = \
BEARSSL \
BHYVE_SNAPSHOT \
BSD_GREP \
CLANG_EXTRAS \
DTRACE_TESTS \
Expand Down
24 changes: 24 additions & 0 deletions sys/amd64/include/vmm.h
Expand Up @@ -34,6 +34,8 @@
#include <sys/sdt.h>
#include <x86/segments.h>

struct vm_snapshot_meta;

#ifdef _KERNEL
SDT_PROVIDER_DECLARE(vmm);
#endif
Expand Down Expand Up @@ -152,6 +154,7 @@ struct vmspace;
struct vm_object;
struct vm_guest_paging;
struct pmap;
enum snapshot_req;

struct vm_eventinfo {
void *rptr; /* rendezvous cookie */
Expand Down Expand Up @@ -180,6 +183,10 @@ typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
typedef void (*vmi_vmspace_free)(struct vmspace *vmspace);
typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
/*
 * Function pointer types for the checkpoint operations in struct vmm_ops
 * (the vmsnapshot, vmcx_snapshot and vm_restore_tsc members).
 *
 * NOTE(review): 'vmi' is presumably the backend's per-VM cookie, as for the
 * other vmi_* hooks; vmi_snapshot_vmcx_t additionally takes a vcpu index,
 * and vmi_restore_tsc_t takes a vcpu id plus a 64-bit 'now' value --
 * confirm exact semantics against the Intel/AMD implementations.
 */
typedef int (*vmi_snapshot_t)(void *vmi, struct vm_snapshot_meta *meta);
typedef int (*vmi_snapshot_vmcx_t)(void *vmi, struct vm_snapshot_meta *meta,
int vcpu);
typedef int (*vmi_restore_tsc_t)(void *vmi, int vcpuid, uint64_t now);

struct vmm_ops {
vmm_init_func_t init; /* module wide initialization */
Expand All @@ -199,6 +206,11 @@ struct vmm_ops {
vmi_vmspace_free vmspace_free;
vmi_vlapic_init vlapic_init;
vmi_vlapic_cleanup vlapic_cleanup;

/* checkpoint operations */
vmi_snapshot_t vmsnapshot;
vmi_snapshot_vmcx_t vmcx_snapshot;
vmi_restore_tsc_t vm_restore_tsc;
};

extern struct vmm_ops vmm_ops_intel;
Expand Down Expand Up @@ -272,6 +284,9 @@ void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
int vm_snapshot_req(struct vm *vm, struct vm_snapshot_meta *meta);
int vm_restore_time(struct vm *vm);


#ifdef _SYS__CPUSET_H_
/*
Expand Down Expand Up @@ -409,6 +424,15 @@ int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);

int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);

/*
* Function used to keep track of the guest's TSC offset. The
* offset is used by the virtualization extensions to provide a consistent
* value for the Time Stamp Counter to the guest.
*
* Return value is 0 on success and non-zero on failure.
*/
int vm_set_tsc_offset(struct vm *vm, int vcpu_id, uint64_t offset);

enum vm_reg_name vm_segment_name(int seg_encoding);

struct vm_copyinfo {
Expand Down
11 changes: 11 additions & 0 deletions sys/amd64/include/vmm_dev.h
Expand Up @@ -31,6 +31,8 @@
#ifndef _VMM_DEV_H_
#define _VMM_DEV_H_

struct vm_snapshot_meta;

#ifdef _KERNEL
void vmmdev_init(void);
int vmmdev_cleanup(void);
Expand Down Expand Up @@ -312,6 +314,11 @@ enum {
IOCNUM_RTC_WRITE = 101,
IOCNUM_RTC_SETTIME = 102,
IOCNUM_RTC_GETTIME = 103,

/* checkpoint */
IOCNUM_SNAPSHOT_REQ = 113,

IOCNUM_RESTORE_TIME = 115
};

#define VM_RUN \
Expand Down Expand Up @@ -422,4 +429,8 @@ enum {
_IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time)
#define VM_RESTART_INSTRUCTION \
_IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
#define VM_SNAPSHOT_REQ \
_IOWR('v', IOCNUM_SNAPSHOT_REQ, struct vm_snapshot_meta)
#define VM_RESTORE_TIME \
_IOWR('v', IOCNUM_RESTORE_TIME, int)
#endif

0 comments on commit 483d953

Please sign in to comment.