Skip to content

Commit

Permalink
WIP Swarm Scale improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
natefoo committed Jul 9, 2018
1 parent bd52ea9 commit 50602ce
Show file tree
Hide file tree
Showing 18 changed files with 565 additions and 244 deletions.
13 changes: 13 additions & 0 deletions roles/usegalaxy_swarmscale/files/_inc_fail.yml
@@ -0,0 +1,13 @@
---

- name: Destroy instance on failure
os_server:
cloud: "{{ cloud_id }}"
name: "{{ inventory_hostname }}"
state: absent
delegate_to: localhost
become: no

- name: Fail due to previous failure
fail:
msg: failed
7 changes: 7 additions & 0 deletions roles/usegalaxy_swarmscale/files/ansible.cfg
@@ -1,5 +1,12 @@
[defaults]
inventory = inventory
remote_user = centos
private_key_file = swarm_kp.pem
retry_files_enabled = false
transport = ssh

[inventory]
enable_plugins = host_list, ini, openstack

[ssh_connection]
ssh_args = -o BatchMode=yes -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no
40 changes: 0 additions & 40 deletions roles/usegalaxy_swarmscale/files/delete.yml

This file was deleted.

12 changes: 12 additions & 0 deletions roles/usegalaxy_swarmscale/files/group_vars/all/spawn.yml
@@ -0,0 +1,12 @@
---

image: JS-API-Featured-CentOS7-May-11-2018
key_name: swarm_kp
nics: net-name=swarm
security_groups: default,swarm_worker
auto_ip: yes
flavor: m1.medium

# don't auto-start docker, this will be done by Ansible on spawn/resume once the hostname is set
docker_service_enabled: false
docker_swarm_manager: jetstream-iu0.galaxyproject.org:2377
70 changes: 48 additions & 22 deletions roles/usegalaxy_swarmscale/files/instance
Expand Up @@ -2,6 +2,7 @@
from __future__ import print_function

import argparse
import json
import math
import os
import shutil
Expand All @@ -11,25 +12,41 @@ import tempfile
import uuid


NODE_BASENAME = 'swarm-'
SLOT_SIZE = 0.25
NODE_BASENAME = 'jetstream-iu-swarm'
SLOT_SIZE = 1.0
MAX_NODES = 8
CLOUD_ID = 'jetstream_iu'
# TODO: load balance
DOCKER_CMD = ['docker', '--host', 'tcp://jetstream-tacc0.galaxyproject.org:2376', '--tlsverify']


def get_node_names(nodect):
    """Select up to ``nodect`` node names that can be spawned or resumed.

    Candidate names are ``NODE_BASENAME`` followed by an index taken from
    ``range(MAX_NODES)``.  A candidate is selected when the inventory printed
    by ``ansible-inventory --list`` has no hostvars entry for it, or when its
    ``status`` hostvar is ``SHUTOFF``.

    :param nodect: number of node names wanted
    :returns: list of selected host names; it is shorter than ``nodect`` when
              fewer than ``nodect`` candidates are available (a warning is
              printed in that case)
    """
    inv = json.loads(run_cmd(['ansible-inventory', '--list'], output=True))
    hostvars = inv['_meta']['hostvars']
    names = []
    for i in range(MAX_NODES):
        if len(names) >= nodect:
            break
        host = NODE_BASENAME + str(i)
        # hosts missing from the inventory are reported as NOT_EXIST
        state = hostvars.get(host, {}).get('status', 'NOT_EXIST')
        # diagnostics go to stderr like the rest of this script; stdout is
        # used for the space-separated node list printed by spawn/destroy
        print('host: %s, state: %s' % (host, state), file=sys.stderr)
        if state in ('NOT_EXIST', 'SHUTOFF'):
            names.append(host)
    # Checked after the loop rather than in a for/else clause: the else clause
    # also fires when the very last candidate satisfies the request.
    if len(names) < nodect:
        print('WARNING: not enough nodes available for %s nodes, raise MAX_NODES? Returning: %s' % (nodect, names),
              file=sys.stderr)
    return names


# spawn() still refers to this function by its previous name
gen_node_names = get_node_names


def get_worker_join_token():
    """Return the worker join token reported by the swarm manager.

    Runs ``docker swarm join-token --quiet worker`` through ``DOCKER_CMD`` and
    returns its output with surrounding whitespace stripped.

    :raises subprocess.CalledProcessError: if the docker command exits non-zero
    """
    return run_cmd(DOCKER_CMD + ['swarm', 'join-token', '--quiet', 'worker'], output=True).strip()


def create_inventory(node_names):
if not os.path.exists('inventory'):
shutil.copy('inventory.head', 'inventory')
inventory = tempfile.NamedTemporaryFile(prefix='inventory.scale_spawn', dir=os.getcwd())
with open('inventory.spawn') as head:
inventory.write(head.read())
inventory = tempfile.NamedTemporaryFile(prefix='spawn.%s.' % os.getpid(), dir=os.path.join(os.getcwd(), 'inventory'))
join_token = get_worker_join_token()
inventory.write('[all:vars]\n')
inventory.write('swarm_worker_join_token = %s\n' % join_token)
inventory.write('[swarmnodes]\n')
for name in node_names:
inventory.write(name)
inventory.write('\n')
Expand All @@ -46,15 +63,18 @@ def get_env():
return env


def run_cmd(cmd, output=False):
    """Run ``cmd`` with the environment from ``get_env()``.

    The command line is echoed to stderr before it is run.

    :param cmd: command and arguments as a list of strings
    :param output: when true, capture and return the command's stdout; when
                   false, the command's stdout is redirected to this process's
                   stderr and nothing is returned
    :returns: the captured stdout if ``output`` is true, otherwise ``None``
    :raises subprocess.CalledProcessError: if the command exits non-zero
    """
    print('Running %s' % ' '.join(cmd), file=sys.stderr)
    if output:
        # The captured output must be returned: callers feed it to
        # json.loads() and str.strip().  universal_newlines=True makes the
        # result text rather than bytes when run under Python 3.
        return subprocess.check_output(cmd, env=get_env(), universal_newlines=True)
    subprocess.check_call(cmd, stdout=sys.stderr, env=get_env())
    return None


def run_playbook(playbook, args=None):
    """Run ``ansible-playbook`` on ``playbook``.

    No ``-i`` option is passed; the inventory is whatever ansible.cfg
    configures.

    :param playbook: path of the playbook to run
    :param args: optional sequence of extra command line arguments
    :raises subprocess.CalledProcessError: if ansible-playbook exits non-zero
    """
    extra_args = list(args) if args else []
    run_cmd(['ansible-playbook', playbook] + extra_args)


Expand All @@ -63,25 +83,31 @@ def spawn(slots):
print('Need %s nodes' % nodes_needed, file=sys.stderr)
node_names = gen_node_names(nodes_needed)
inventory = create_inventory(node_names)
limit_arg = ':'.join(node_names)
try:
run_playbook('spawn.yml', inventory.name)
run_playbook('spawn.yml', inventory.name, args=['--limit', limit_arg])
except subprocess.CalledProcessError as exc:
# indicate that spawning should be retried
print("WARNING: spawn failed with code %s\n" % exc.returncode, file=sys.stderr)
destroy(node_names)
sys.exit(2)
finally:
inventory.close()
print(' '.join(node_names))
print(' '.join(node_names))


def destroy(nodes):
    """Take ``nodes`` out of service and remove them from the swarm.

    Runs the ``suspend.yml`` playbook limited to ``nodes``, then removes the
    nodes with ``docker node rm``, retrying with ``-f`` if the plain removal
    fails.  Finally prints the space-separated node names on stdout.

    :param nodes: list of node host names
    :raises subprocess.CalledProcessError: if the forced ``docker node rm``
        also fails
    """
    limit_arg = ':'.join(nodes)
    try:
        run_playbook('suspend.yml', args=['--limit', limit_arg])
    except subprocess.CalledProcessError as exc:
        # best effort: a failed playbook run must not prevent node removal
        print("WARNING: `docker swarm leave` or instance suspend failed with code %s\n" % exc.returncode,
              file=sys.stderr)
    try:
        run_cmd(DOCKER_CMD + ['node', 'rm'] + list(nodes))
    except subprocess.CalledProcessError as exc:
        print("WARNING: `docker node rm` failed with code %s\n" % exc.returncode, file=sys.stderr)
        # fall back to forced removal; a failure here propagates to the caller
        run_cmd(DOCKER_CMD + ['node', 'rm', '-f'] + list(nodes))
    print(' '.join(nodes))


Expand Down
12 changes: 0 additions & 12 deletions roles/usegalaxy_swarmscale/files/inventory.head

This file was deleted.

21 changes: 0 additions & 21 deletions roles/usegalaxy_swarmscale/files/inventory.spawn

This file was deleted.

7 changes: 7 additions & 0 deletions roles/usegalaxy_swarmscale/files/inventory/openstack.yml
@@ -0,0 +1,7 @@
---

plugin: openstack
only_clouds:
- jetstream_iu
private: no
expand_hostvars: no
10 changes: 0 additions & 10 deletions roles/usegalaxy_swarmscale/files/leave.yml

This file was deleted.

0 comments on commit 50602ce

Please sign in to comment.