Merge pull request #86 from clusterinthecloud/aws_module_port
Use python-citc module to start AWS nodes
milliams committed Oct 19, 2020
2 parents 31ea2db + 23f9368 commit 30f84f9
Showing 4 changed files with 19 additions and 53 deletions.
1 change: 0 additions & 1 deletion .github/workflows/main.yml
@@ -21,7 +21,6 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install oci pytest-mock pytest-asyncio pyyaml ansible-lint mypy requests_mock google-api-python-client google-auth boto3 "boto3-stubs[ec2,route53]"
python -m mypy_boto3
- name: Test with pytest
run: pytest
- name: Ansible lint
66 changes: 16 additions & 50 deletions roles/slurm/files/citc_aws.py
@@ -5,54 +5,22 @@
import asyncio

import boto3
import citc.aws
import citc.cloud
import citc.utils
#from mypy_boto3 import ec2, route53


def load_yaml(filename: str) -> dict:
with open(filename, "r") as f:
return yaml.safe_load(f)


def get_nodespace(file="/etc/citc/startnode.yaml") -> Dict[str, str]:
"""
Get the information about the space into which we were creating nodes
This will be static for all nodes in this cluster
"""
return load_yaml(file)


def get_node(client, hostname: str, cluster_id: str): # -> ec2.Instance?
instance = client.describe_instances(Filters=[
{
"Name": "tag:Name",
"Values": [hostname],
},
{
"Name": "tag:cluster",
"Values": [cluster_id],
},
{
"Name": "instance-state-name",
"Values": ["pending", "running", "shutting-down", "stopping"],
},
])
# TODO check for multiple returned matches
if instance["Reservations"]:
return instance["Reservations"][0]["Instances"][0]
return None


def get_node_state(client, hostname: str, cluster_id: str) -> Optional[str]:
def get_node_state(client, hostname: str, nodespace: Dict[str, str]) -> citc.cloud.NodeState:
"""
Get the current node state of the VM for the given hostname
If there is no such VM, return "TERMINATED"
If there is no such VM, return TERMINATED
"""

item = get_node(client, hostname, cluster_id)

if item is not None:
return item['State']["Name"]
return None
try:
return citc.aws.AwsNode.from_name(hostname, client, nodespace).state
except citc.aws.NodeNotFound:
return citc.cloud.NodeState.TERMINATED


def get_node_features(hostname):
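
For orientation, here is a minimal usage sketch (not part of the diff) of the reworked get_node_state above. The hostname and the nodespace values are placeholders; the nodespace keys are the ones referenced elsewhere in this file, and the EC2 client is created directly with boto3 rather than through this file's ec2_client helper.

    import boto3
    import citc.cloud

    # Placeholder nodespace; the keys mirror those used elsewhere in this diff.
    nodespace = {
        "region": "eu-west-1",
        "cluster_id": "mycluster",
        "dns_zone": "example.internal",
        "dns_zone_id": "ZEXAMPLE",
    }

    client = boto3.client("ec2", region_name=nodespace["region"])

    # Assumes get_node_state as defined above is in scope (i.e. running inside citc_aws.py).
    state = get_node_state(client, "mycluster-compute-0-1", nodespace)
    if state == citc.cloud.NodeState.TERMINATED:
        # TERMINATED is also what get_node_state reports when no matching VM exists.
        print("node is gone or was never created")

Returning an enum member rather than a raw instance-state-name string is what lets the callers further down compare directly against NodeState values.
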
@@ -221,12 +189,12 @@ async def start_node(log, host: str, nodespace: Dict[str, str], ssh_keys: str) -

client = ec2_client(region)

while get_node_state(client, host, nodespace["cluster_id"]) in ["shutting-down", "stopping"]:
while get_node_state(client, host, nodespace) in [citc.cloud.NodeState.TERMINATING, citc.cloud.NodeState.STOPPING]:
log.info(" host is currently being deleted. Waiting...")
await asyncio.sleep(5)

node_state = get_node_state(client, host, nodespace["cluster_id"])
if node_state in ["pending", "running"]:
node_state = get_node_state(client, host, nodespace)
if node_state in [citc.cloud.NodeState.PENDING, citc.cloud.NodeState.RUNNING]:
log.warning(f" host already exists with state {node_state}")
return
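
The wait loop now compares citc.cloud.NodeState members instead of raw EC2 instance-state-name strings. Roughly, the correspondence implied by this change is sketched below; the mapping is inferred from the strings being replaced and is illustrative, not taken from python-citc's documentation.

    import citc.cloud

    # Assumed correspondence between EC2 instance states and citc.cloud.NodeState,
    # inferred from the strings this commit replaces (illustrative only).
    ASSUMED_STATE_MAP = {
        "pending": citc.cloud.NodeState.PENDING,
        "running": citc.cloud.NodeState.RUNNING,
        "shutting-down": citc.cloud.NodeState.TERMINATING,
        "stopping": citc.cloud.NodeState.STOPPING,
    }
    # A VM that cannot be found at all is reported as NodeState.TERMINATED by
    # get_node_state (see the except branch in the first hunk).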

@@ -258,7 +226,7 @@ async def start_node(log, host: str, nodespace: Dict[str, str], ssh_keys: str) -

def terminate_instance(log, hosts, nodespace=None):
if not nodespace:
nodespace = get_nodespace()
nodespace = citc.utils.get_nodespace()

region = nodespace["region"]

@@ -268,13 +236,11 @@ def terminate_instance(log, hosts, nodespace=None):
log.info(f"Stopping {host}")

try:
instance = get_node(client, host, nodespace["cluster_id"])
instance_id = instance["InstanceId"]
vm_ip = instance["PrivateIpAddress"]
instance = citc.aws.AwsNode.from_name(host, client, nodespace)
fqdn = f"{host}.{nodespace['dns_zone']}"
client.terminate_instances(InstanceIds=[instance_id])
client.terminate_instances(InstanceIds=[instance.id])
r53_client = route53_client()
delete_dns_record(r53_client, nodespace["dns_zone_id"], fqdn, "A", vm_ip, 300)
delete_dns_record(r53_client, nodespace["dns_zone_id"], fqdn, "A", instance.ip, 300)
except Exception as e:
log.error(f" problem while stopping: {e}")
continue
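
The AwsNode object returned by citc.aws.AwsNode.from_name bundles the fields that the old code pulled out of the describe_instances response by hand. A short sketch of the attribute access this hunk relies on; the hostname and nodespace are placeholders, and only .id, .ip and .state are taken from the diff itself.

    import citc.aws

    # Placeholder lookup mirroring the call inside terminate_instance above;
    # client and nodespace are assumed to be set up as elsewhere in this file.
    node = citc.aws.AwsNode.from_name("mycluster-compute-0-1", client, nodespace)

    node.id     # EC2 instance ID, previously instance["InstanceId"]
    node.ip     # private IP address, previously instance["PrivateIpAddress"]
    node.state  # a citc.cloud.NodeState member

    client.terminate_instances(InstanceIds=[node.id])
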
3 changes: 2 additions & 1 deletion roles/slurm/files/startnode.py
@@ -4,6 +4,7 @@
import logging
import subprocess
import sys
import citc.utils
import citc_cloud


@@ -17,7 +18,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):


async def main() -> None:
nodespace = citc_cloud.get_nodespace()
nodespace = citc.utils.get_nodespace()

keys_file = "/home/slurm/opc_authorized_keys"

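citc.utils.get_nodespace() replaces the local helper deleted from citc_aws.py in this same commit. Assuming it behaves like that helper did (not verified against python-citc), the equivalent logic looks like this:

    import yaml

    # Local stand-in for citc.utils.get_nodespace, mirroring the helper this
    # commit removes from citc_aws.py; the real library call may differ.
    def get_nodespace_local(path: str = "/etc/citc/startnode.yaml") -> dict:
        with open(path, "r") as f:
            return yaml.safe_load(f)

    nodespace = get_nodespace_local()
    # Expected keys, judging by the rest of this diff: region, cluster_id,
    # dns_zone, dns_zone_id.
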
2 changes: 1 addition & 1 deletion roles/slurm/tasks/elastic.yml
@@ -3,7 +3,7 @@
pip:
name:
- PyYAML
- https://github.com/clusterinthecloud/python-citc/releases/download/0.3.5/citc-0.3.5-py3-none-any.whl
- https://github.com/clusterinthecloud/python-citc/releases/download/0.3.9/citc-0.3.9-py3-none-any.whl
virtualenv: /opt/cloud_sdk
virtualenv_command: "{{ python_venv }}"
register: cloud_sdk_venv
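
After this task runs, the pinned wheel should be installed into the /opt/cloud_sdk virtualenv as citc 0.3.9. A quick check from inside that virtualenv, assuming Python 3.8+ for importlib.metadata:

    from importlib.metadata import version

    # Run with /opt/cloud_sdk/bin/python; should print 0.3.9 after this change.
    print(version("citc"))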
