Skip to content

Commit

Permalink
Implement basic automatic KVM postcopy migration (#1262)
Browse files Browse the repository at this point in the history
This commit adds postcopy functionality to ganeti by automatically
switching a migration to postcopy mode if the postcopy-ram or
x-postcopy-ram migration capability flag is set in an instance's
hypervisor parameters.

Signed-off-by: Calum Calder <calumcalder@google.com>
Reviewed-by: Federico Morg Pareschi <morg@google.com>
  • Loading branch information
calumcalder authored and Morgawr committed Aug 1, 2017
1 parent 5fbc45c commit 041ed56
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 6 deletions.
48 changes: 46 additions & 2 deletions lib/hypervisor/hv_kvm/__init__.py
Expand Up @@ -538,12 +538,14 @@ class KVMHypervisor(hv_base.BaseHypervisor):
_VIRTIO_NET_PCI = "virtio-net-pci"
_VIRTIO_BLK_PCI = "virtio-blk-pci"

_MIGRATION_STATUS_RE = re.compile(r"Migration\s+status:\s+(\w+)",
_MIGRATION_STATUS_RE = re.compile(r"Migration\s+status:\s+([-\w]+)",
re.M | re.I)
_MIGRATION_PROGRESS_RE = \
re.compile(r"\s*transferred\s+ram:\s+(?P<transferred>\d+)\s+kbytes\s*\n.*"
r"\s*remaining\s+ram:\s+(?P<remaining>\d+)\s+kbytes\s*\n"
r"\s*total\s+ram:\s+(?P<total>\d+)\s+kbytes\s*\n", re.I)
_MIGRATION_PRECOPY_PASSES_RE = \
re.compile(r"\s*dirty sync count:\s+(\d+)", re.I | re.M)

_MIGRATION_INFO_MAX_BAD_ANSWERS = 5
_MIGRATION_INFO_RETRY_DELAY = 2
Expand Down Expand Up @@ -2485,13 +2487,55 @@ def MigrateInstance(self, cluster_name, instance, target, live_migration):

migration_caps = instance.hvparams[constants.HV_KVM_MIGRATION_CAPS]
if migration_caps:
for c in migration_caps.split(_MIGRATION_CAPS_DELIM):
capabilities = migration_caps.split(_MIGRATION_CAPS_DELIM)
postcopy_enabled = ('x-postcopy-ram' in capabilities
or 'postcopy-ram' in capabilities)
for c in capabilities:
migrate_command = ("migrate_set_capability %s on" % c)
self._CallMonitorCommand(instance_name, migrate_command)
else:
postcopy_enabled = False

migrate_command = "migrate -d tcp:%s:%s" % (target, port)
self._CallMonitorCommand(instance_name, migrate_command)

if postcopy_enabled:
self._PostcopyAfterPrecopy(instance)

def _PostcopyAfterPrecopy(self, instance):
  """Enable postcopying RAM after one precopy pass.

  Requires that an instance is currently migrating, and that the
  postcopy-ram (x-postcopy-ram on QEMU version 2.5 and below)
  migration capability is enabled in the instance's hypervisor
  parameters.

  The monitor is polled with C{info migrate} until the reported dirty
  sync count reaches 2; C{migrate_start_postcopy} is then sent. Polling
  stops without switching to postcopy as soon as the reported migration
  status is anything other than C{active}, or the monitor command wrote
  something to stderr. A reply from which neither a status nor a dirty
  sync count can be parsed simply leads to another poll.

  @type instance: L{objects.Instance}
  @param instance: The instance being migrated.

  """
  # NOTE(review): the monitor is queried back-to-back, there is no delay
  # between two polls and no upper bound on their number.
  # NOTE(review): presumably a dirty sync count of 2 means the first full
  # precopy pass has finished -- confirm against the QEMU documentation.
  precopy_passes = 0
  while precopy_passes < 2:
    migration_status = self._CallMonitorCommand(instance.name, 'info migrate')

    status_match = self._MIGRATION_STATUS_RE.search(migration_status.stdout)
    if status_match and status_match.group(1) != 'active':
      # Any other status (e.g. "failed" or "postcopy-active") means there
      # is no running precopy phase left that could be switched.
      logging.debug('Did not attempt postcopy, migration status: %s',
                    status_match.group(1))
      break
    if migration_status.stderr:
      logging.debug('Error polling for dirty sync count in '
                    'hv_kvm._PostcopyAfterPrecopy(): %s',
                    migration_status.stderr)
      break

    passes_match = \
      self._MIGRATION_PRECOPY_PASSES_RE.search(migration_status.stdout)
    if passes_match:
      precopy_passes = int(passes_match.group(1))
  else:
    # while/else: this branch runs only when the loop ended because the
    # pass count was reached, never after one of the "break"s above.
    self._CallMonitorCommand(instance.name, 'migrate_start_postcopy')

def FinalizeMigrationSource(self, instance, success, _):
"""Finalize the instance migration on the source node.
Expand Down
12 changes: 8 additions & 4 deletions man/gnt-instance.rst
Expand Up @@ -866,10 +866,14 @@ migration\_caps

Enable specific migration capabilities by providing a ":" separated
list of supported capabilities. QEMU version 1.7.0 defines
x-rdma-pin-all, auto-converge, zero-blocks, and xbzrle. Please note
that while a combination of xbzrle and auto-converge might speed up
the migration process significantly, the first may cause BSOD on
Windows8r2 instances running on drbd.
x-rdma-pin-all, auto-converge, zero-blocks, and xbzrle. QEMU version
2.5 defines x-postcopy-ram and 2.6 renames this to postcopy-ram.
If x-postcopy-ram or postcopy-ram is enabled, Ganeti will
automatically move a migration to postcopy mode after one iteration
of precopying the instance's RAM.
Please note that while a combination of xbzrle and auto-converge
might speed up the migration process significantly, the first may
cause BSOD on Windows8r2 instances running on drbd.

kvm\_path
Valid for the KVM hypervisor.
Expand Down
145 changes: 145 additions & 0 deletions test/py/ganeti.hypervisor.hv_kvm_unittest.py
Expand Up @@ -704,5 +704,150 @@ def get_mock_process(unused_pid):
self.assertEqual(mock_process.set_cpu_affinity.call_args_list[1],
mock.call([4]))

class TestPostcopyAfterPrecopy(testutils.GanetiTestCase):
  """Tests for C{KVMHypervisor._PostcopyAfterPrecopy}.

  Every test replaces C{ganeti.utils.RunCmd} with a scripted function that
  answers "info migrate" commands with canned monitor output, runs
  C{_PostcopyAfterPrecopy} and then checks whether a command containing
  "migrate_start_postcopy" was issued.

  """
  # Canned reply to "info migrate"; only the status and the dirty sync count
  # vary between tests. The remaining text is kept exactly as the tests have
  # always sent it (including "auto-converge: on" running straight into
  # "zero-blocks").
  _INFO_MIGRATE_REPLY = (
    'QEMU 2.5.0 monitor - type \'help\' for more information\n'
    '(qemu) info migrate\n'
    'capabilities: xbzrle: off rdma-pin-all: off auto-converge: on'
    'zero-blocks: off compress: off events: off x-postcopy-ram: on \n'
    'Migration status: %(status)s\n'
    'skipped: 0 pages\n'
    'dirty sync count: %(sync_count)s\n'
    '(qemu) '
    )

  # Reply that carries neither a status line nor a dirty sync count.
  _EMPTY_REPLY = (
    'QEMU 2.5.0 monitor - type \'help\' for more information\n'
    '(qemu) info migrate\n'
    '(qemu) '
    )

  # Some replies carry this literal text where the dirty sync count would
  # be. It contains no digits, so no pass count can be parsed from such a
  # reply; the expected results of the tests using it depend on that (a
  # real counter would reach 2 and trigger postcopy before the scripted
  # failure is delivered).
  _UNFORMATTED_SYNC_COUNT = '%i'

  def setUp(self):
    super(TestPostcopyAfterPrecopy, self).setUp()
    # NOTE(review): MockOut is not defined in this class; presumably it is
    # provided elsewhere in the test module or by the base class -- confirm.
    self.MockOut('qmp', mock.patch('ganeti.hypervisor.hv_kvm.QmpConnection'))
    self.MockOut('run_cmd', mock.patch('ganeti.utils.RunCmd'))
    self.MockOut('ensure_dirs', mock.patch('ganeti.utils.EnsureDirs'))
    self.MockOut('write_file', mock.patch('ganeti.utils.WriteFile'))
    self.params = constants.HVC_DEFAULTS[constants.HT_KVM].copy()

  @staticmethod
  def _EmptyResult(cmd):
    """Return a successful C{RunResult} for C{cmd} without any output.

    """
    return utils.RunResult(0, None, '', '', cmd, None, None)

  @classmethod
  def _InfoMigrateReply(cls, status, sync_count):
    """Build the canned "info migrate" reply.

    @param status: text reported as migration status
    @param sync_count: value reported as dirty sync count (a number, or
        L{_UNFORMATTED_SYNC_COUNT})

    """
    return cls._INFO_MIGRATE_REPLY % {
      'status': status,
      'sync_count': sync_count,
      }

  def _TestPostcopyAfterPrecopy(self, runcmd, postcopy_started_goal):
    """Run C{_PostcopyAfterPrecopy} against a scripted C{RunCmd}.

    @param runcmd: callable taking C{(cmd, res)} and returning the
        C{RunResult} to hand back for C{cmd}
    @param postcopy_started_goal: whether a "migrate_start_postcopy"
        command is expected to have been issued

    """
    hypervisor = hv_kvm.KVMHypervisor()
    self.iteration = 0
    self.postcopy_started = False

    def runcmd_mock(cmd, env=None, output=None, cwd="/", reset_env=False,
                    interactive=False, timeout=None, noclose_fds=None,
                    input_fd=None, postfork_fn=None):
      res = utils.RunResult(0, None, '', '', cmd, None, None)
      # Record the first attempt to switch to postcopy; this flag is what
      # the final assertion looks at.
      if not self.postcopy_started and 'migrate_start_postcopy' in cmd:
        self.postcopy_started = True
        res.stdout = ('migrate_postcopy_start\n'
                      '(qemu) ')
      return runcmd(cmd, res)

    with mock.patch('ganeti.utils.RunCmd', runcmd_mock):
      instance = mock.MagicMock()
      instance.name = 'example.instance'
      hypervisor._PostcopyAfterPrecopy(instance)
      self.assertEqual(self.postcopy_started, postcopy_started_goal)

  def testNormal(self):
    """The dirty sync count rises with every poll; postcopy is started.

    """
    def runcmd_normal(cmd, unused_res):
      res = self._EmptyResult(cmd)
      if 'info migrate' in cmd:
        self.iteration += 1
        res.stdout = self._InfoMigrateReply('active', self.iteration)
      return res

    self._TestPostcopyAfterPrecopy(runcmd_normal, True)

  def testEmptyResponses(self):
    """Two empty replies precede a usable one; postcopy is still started.

    """
    def runcmd_empty_responses(cmd, unused_res):
      res = self._EmptyResult(cmd)
      if 'info migrate' in cmd:
        self.iteration += 1
        if self.iteration < 3:
          res.stdout = self._EMPTY_REPLY
        else:
          res.stdout = self._InfoMigrateReply('active', self.iteration)
      return res

    self._TestPostcopyAfterPrecopy(runcmd_empty_responses, True)

  def testMonitorRemoved(self):
    """The third poll only yields a socat error; postcopy is not started.

    """
    def runcmd_monitor_removed(cmd, unused_res):
      res = self._EmptyResult(cmd)
      if 'info migrate' in cmd:
        self.iteration += 1
        if self.iteration < 3:
          res.stdout = self._InfoMigrateReply('active',
                                              self._UNFORMATTED_SYNC_COUNT)
        else:
          res.stderr = ('2017/07/26 15:49:52 socat[105703] E connect(3, AF=1 '
                        '"/var/run/ganeti/kvm-hypervisor/ctrl/example.instanc'
                        'e.monitor", 85): No such file or directory')
      return res

    self._TestPostcopyAfterPrecopy(runcmd_monitor_removed, False)

  def testMigrationFailed(self):
    """The third poll reports status "failed"; postcopy is not started.

    """
    def runcmd_migration_failed(cmd, unused_res):
      res = self._EmptyResult(cmd)
      if 'info migrate' in cmd:
        self.iteration += 1
        if self.iteration < 3:
          status = 'active'
        else:
          status = 'failed'
        res.stdout = self._InfoMigrateReply(status,
                                            self._UNFORMATTED_SYNC_COUNT)
      return res

    self._TestPostcopyAfterPrecopy(runcmd_migration_failed, False)

  def testAlreadyInPostcopy(self):
    """Status is "postcopy-active" from the start; nothing is issued.

    """
    def runcmd_already_in_postcopy(cmd, unused_res):
      res = self._EmptyResult(cmd)
      if 'info migrate' in cmd:
        res.stdout = self._InfoMigrateReply('postcopy-active',
                                            self._UNFORMATTED_SYNC_COUNT)
      return res

    self._TestPostcopyAfterPrecopy(runcmd_already_in_postcopy, False)

# Run the Ganeti test program only when this file is executed as a script.
if __name__ == "__main__":
  testutils.GanetiTestProgram()

0 comments on commit 041ed56

Please sign in to comment.